Unverified commit bb48b596, authored by: Y Young-Flash, committed by: GitHub

delete paddle/fluid/operators/*_mlu.* files (#52435)

Parent 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class AbsMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Abs(ctx,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class AbsGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*x);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
phi::DenseTensor sign_x;
sign_x.mutable_data<T>(x->dims(), ctx.GetPlace());
MLUCnnl::Sign(ctx,
input_desc.get(),
GetBasePtr(x),
input_desc.get(),
GetBasePtr(&sign_x));
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
input_desc.get(),
GetBasePtr(&sign_x),
input_desc.get(),
GetBasePtr(dout),
input_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
}
};
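// Note: the backward pass above follows d|x|/dx = sign(x), i.e. dx = sign(x) * dout,
// implemented as MLUCnnl::Sign followed by an elementwise CNNL_OP_TENSOR_MUL of
// sign(x) and dout.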
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(abs,
ops::AbsMLUKernel<float>,
ops::AbsMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(abs_grad,
ops::AbsGradMLUKernel<float>,
ops::AbsGradMLUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <cnnlActivationMode_t act_mode, typename T>
class ActivationMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Active(ctx,
act_desc.get(),
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
// For gelu, leaky_relu
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV1 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dx_desc.get(),
GetBasePtr(dx));
}
};
// For tanh, sigmoid
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV2 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out),
dout_desc.get(),
GetBasePtr(dout),
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
};
// For relu, relu6
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV3 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(out),
dx_desc.get(),
GetBasePtr(dx));
}
};
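// Note: the three ActivationGradMLUKernel variants above differ only in which forward
// tensor they forward to MLUCnnl::ActiveGrad. V1 (gelu, leaky_relu) passes the input X
// after dout; V2 (tanh, sigmoid) passes the output Out before dout, since those gradients
// can be written in terms of the output (e.g. tanh' = 1 - out^2, sigmoid' = out * (1 - out));
// V3 (relu, relu6) passes Out in the slot V1 uses for X.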
// For sqrt
template <typename T>
class SqrtMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc input_desc(*x);
MLUCnnlTensorDesc output_desc(*out);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
MLUCnnl::Sqrt(ctx,
prefer,
input_desc.get(),
GetBasePtr(x),
output_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SqrtGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
MLUCnnlTensorDesc data_desc(*out);
MLUCnnl::SqrtGrad(ctx,
data_desc.get(),
GetBasePtr(out),
GetBasePtr(dout),
GetBasePtr(dx));
}
};
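// Note: since out = sqrt(x), the backward pass is dx = dout / (2 * out), so
// MLUCnnl::SqrtGrad only needs the forward output and the output gradient.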
// CNNL_LOG_E = 0,
// CNNL_LOG_2 = 1,
// CNNL_LOG_10 = 2,
template <cnnlLogBase_t Log_base, typename T>
class LogMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION;
MLUCnnl::Log(ctx,
prefer,
Log_base,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class ExpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION;
MLUCnnl::Exp(ctx,
prefer,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class ExpGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(out),
dx_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
}
};
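// Note: since d(exp(x))/dx = exp(x) = out, the backward pass is simply dx = dout * out,
// implemented as an elementwise CNNL_OP_TENSOR_MUL of dout and out.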
template <typename T>
class HardSwishMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
float threshold = ctx.Attr<float>("threshold");
float scale = ctx.Attr<float>("scale");
float offset = ctx.Attr<float>("offset");
PADDLE_ENFORCE_EQ(threshold,
6.0f,
platform::errors::External(
"Not support threshold [%f] in MLU", threshold));
PADDLE_ENFORCE_EQ(
scale,
6.0f,
platform::errors::External("Not support scale [%f] in MLU", scale));
PADDLE_ENFORCE_EQ(
offset,
3.0f,
platform::errors::External("Not support offset [%f] in MLU", offset));
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH,
1.0f /*coef useless*/);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Active(ctx,
act_desc.get(),
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class HardSwishGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
float threshold = ctx.Attr<float>("threshold");
float scale = ctx.Attr<float>("scale");
float offset = ctx.Attr<float>("offset");
PADDLE_ENFORCE_EQ(threshold,
6.0f,
platform::errors::External(
"Not support threshold [%f] in MLU", threshold));
PADDLE_ENFORCE_EQ(
scale,
6.0f,
platform::errors::External("Not support scale [%f] in MLU", scale));
PADDLE_ENFORCE_EQ(
offset,
3.0f,
platform::errors::External("Not support offset [%f] in MLU", offset));
auto* out = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH,
1.0f /*coef useless*/);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(out),
dx_desc.get(),
GetBasePtr(dx));
}
};
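// Note: the enforced values match Paddle's hard_swish defaults
// (x * min(max(x + offset, 0), threshold) / scale with threshold = scale = 6, offset = 3);
// the CNNL hard-swish activation presumably supports only this fixed form, hence the
// checks in both the forward and backward kernels above.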
template <typename T>
class HardSigmoidMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
float slope = ctx.Attr<float>("slope");
float offset = ctx.Attr<float>("offset");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID,
1.0f /*coef useless*/,
1.0f /*sliced_dim useless*/,
slope,
offset);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Active(ctx,
act_desc.get(),
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class HardSigmoidGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float slope = ctx.Attr<float>("slope");
float offset = ctx.Attr<float>("offset");
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID,
1.0f /*coef useless*/,
1.0f /*sliced_dim useless*/,
slope,
offset);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dx_desc.get(),
GetBasePtr(dx));
}
};
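// Note: Paddle defines hard_sigmoid(x) = max(0, min(1, slope * x + offset)), so the
// gradient is slope * dout inside the linear region and 0 elsewhere; slope and offset
// are forwarded to the CNNL activation descriptor in both kernels above.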
template <typename T>
class FloorMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Floor(ctx,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename DeviceContext, typename T>
class ReciprocalMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Reciprocal(
ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class ReciprocalGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
phi::DenseTensor square_out;
square_out.Resize(out->dims());
square_out.mutable_data<T>(place);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlTensorDesc square_out_desc(square_out);
MLUCnnl::Square(ctx,
out_desc.get(),
GetBasePtr(out),
square_out_desc.get(),
GetBasePtr(&square_out));
cnnlOpTensorDesc_t op_tensor_op = CNNL_OP_TENSOR_MUL;
cnnlDataType_t op_tensor_comp_type = CNNL_DTYPE_FLOAT;
cnnlNanPropagation_t op_tensor_nan_opt = CNNL_NOT_PROPAGATE_NAN;
MLUCnnlOpTensorDesc op_tensor_desc(
op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt);
float alpha1_float = -1;
float alpha2_float = 1;
float beta_float = 0;
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
square_out_desc.get(),
GetBasePtr(&square_out),
dx_desc.get(),
GetBasePtr(dx),
op_tensor_comp_type,
alpha1_float,
alpha2_float,
beta_float);
}
};
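// Note: with out = 1 / x, the gradient is dx = -dout / x^2 = -dout * out^2. The kernel
// squares out and then calls OpTensor with alpha1 = -1, alpha2 = 1, beta = 0, assuming
// the usual op-tensor semantics c = op(alpha1 * a, alpha2 * b) + beta * c.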
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// reciprocal
REGISTER_OP_MLU_KERNEL(
reciprocal,
ops::ReciprocalMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::ReciprocalMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
reciprocal_grad,
ops::ReciprocalGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::ReciprocalGradMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
// relu
REGISTER_OP_MLU_KERNEL(
relu,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu_grad,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU,
paddle::platform::float16>);
// relu6
REGISTER_OP_MLU_KERNEL(
relu6,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu6_grad,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6,
paddle::platform::float16>);
// sigmoid
REGISTER_OP_MLU_KERNEL(sigmoid,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
sigmoid_grad,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);
// tanh
REGISTER_OP_MLU_KERNEL(
tanh,
ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
tanh_grad,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH,
paddle::platform::float16>);
// gelu
REGISTER_OP_MLU_KERNEL(
gelu,
ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
gelu_grad,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU,
paddle::platform::float16>);
// leaky_relu
REGISTER_OP_MLU_KERNEL(
leaky_relu,
ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
leaky_relu_grad,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
// sqrt
REGISTER_OP_MLU_KERNEL(sqrt,
ops::SqrtMLUKernel<float>,
ops::SqrtMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(sqrt_grad,
ops::SqrtGradMLUKernel<float>,
ops::SqrtGradMLUKernel<paddle::platform::float16>);
// log, log2, log10
REGISTER_OP_MLU_KERNEL(
log,
ops::LogMLUKernel<CNNL_LOG_E, float>,
ops::LogMLUKernel<CNNL_LOG_E, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
log2,
ops::LogMLUKernel<CNNL_LOG_2, float>,
ops::LogMLUKernel<CNNL_LOG_2, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
log10,
ops::LogMLUKernel<CNNL_LOG_10, float>,
ops::LogMLUKernel<CNNL_LOG_10, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(exp,
ops::ExpMLUKernel<float>,
ops::ExpMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(exp_grad,
ops::ExpGradMLUKernel<float>,
ops::ExpGradMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(hard_swish,
ops::HardSwishMLUKernel<float>,
ops::HardSwishMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(hard_swish_grad,
ops::HardSwishGradMLUKernel<float>,
ops::HardSwishGradMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(hard_sigmoid,
ops::HardSigmoidMLUKernel<float>,
ops::HardSigmoidMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
hard_sigmoid_grad,
ops::HardSigmoidGradMLUKernel<float>,
ops::HardSigmoidGradMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(floor,
ops::FloorMLUKernel<float>,
ops::FloorMLUKernel<paddle::platform::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ArgMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto axis = static_cast<int>(ctx.Attr<int64_t>("axis"));
auto dtype = ctx.Attr<int>("dtype");
const bool& flatten = ctx.Attr<bool>("flatten");
if (x->numel() == 0) return;
PADDLE_ENFORCE_EQ(
(dtype == 2 || dtype == 3),
true,
platform::errors::InvalidArgument(
"The attribute of dtype in argmax op must be [%s] or [%s], "
"but "
"received [%s]",
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
static_cast<framework::proto::VarType::Type>(dtype))));
if (axis < 0) {
framework::DDim x_dims;
x_dims = x->dims();
axis += x_dims.size();
}
phi::DenseTensor flatten_x(x->type());
flatten_x.ShareDataWith(*x);
if (flatten) {
flatten_x.Resize(phi::make_ddim({x->numel()}));
// if flatten, the axis is treated as 0
axis = 0;
}
std::vector<int> reduce_dims;
reduce_dims.push_back(axis);
auto out_dims = out->dims();
int out_count = out_dims[0];
for (int i = 1; i < out_dims.size(); i++) {
out_count = out_count * out_dims[i];
}
size_t indices_size_inbytes = out_count * sizeof(int32_t);
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor value_out =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(out->dims(), dev_ctx);
MLUCnnlTensorDesc value_out_desc(value_out);
MLUCnnlTensorDesc input_desc(
flatten_x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(flatten_x.dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_MAX,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_ONLY_INDICES,
CNNL_32BIT_INDICES);
if (dtype == 2) {
out->template mutable_data<int32_t>(ctx.GetPlace());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(&flatten_x),
indices_size_inbytes /*indices_size*/,
GetBasePtr(out),
nullptr,
value_out_desc.get(),
GetBasePtr(&value_out));
} else {
out->template mutable_data<int64_t>(ctx.GetPlace());
phi::DenseTensor out_int32 =
ctx.AllocateTmpTensor<int32_t, MLUDeviceContext>(out->dims(),
dev_ctx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(&flatten_x),
indices_size_inbytes /*indices_size*/,
GetBasePtr(&out_int32),
nullptr,
value_out_desc.get(),
GetBasePtr(&value_out));
// cast indices type to int64
MLUCnnlTensorDesc out_int32_desc(out_int32);
MLUCnnlTensorDesc cast_output_desc(*out);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_int32),
cast_output_desc.get(),
GetBasePtr(out));
}
}
};
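// Note: the "dtype" attribute selects the index type of Out (2 -> int32, 3 -> int64).
// The CNNL reduce emits int32 indices, so the int64 case reduces into a temporary int32
// tensor and casts it to int64 afterwards.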
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(arg_max,
ops::ArgMaxMLUKernel<int>,
ops::ArgMaxMLUKernel<float>,
ops::ArgMaxMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ArgsortMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
const auto& place = ctx.GetPlace();
const auto& sorted = true;
const bool descending = ctx.Attr<bool>("descending");
// axis < 0, calculate the real axis
int axis = static_cast<int>(ctx.Attr<int>("axis"));
if (axis < 0) {
const auto& in_dims = input->dims();
axis += in_dims.size();
}
auto in_dims = input->dims();
size_t k = in_dims[axis];
output->mutable_data<T>(place);
indices->mutable_data<int64_t>(place);
// cnnl only supports int32/int16 indices
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int32_t>(place);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc values_output_desc(*output);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnl::TopK(ctx,
k,
axis,
descending,
sorted,
input_desc.get(),
GetBasePtr(input),
values_output_desc.get(),
GetBasePtr(output),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
// cast indices type to int64
MLUCnnlTensorDesc cast_output_desc(*indices);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
cast_output_desc.get(),
GetBasePtr(indices));
}
};
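// Note: argsort is expressed as a full TopK along `axis` with k = in_dims[axis],
// sorted = true, and the direction taken from the "descending" attribute; the int32
// indices returned by CNNL are then cast into the int64 Indices output.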
template <typename T>
class ArgsortGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
int axis = ctx.Attr<int>("axis");
dx->mutable_data<T>(ctx.GetPlace());
auto in_dims = indices->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (dout->numel() == 0) return;
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc indices_desc(*indices);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::ScatterFunctor(ctx,
dx_desc.get(),
GetBasePtr(dx),
dout_desc.get(),
GetBasePtr(dout),
indices_desc.get(),
GetBasePtr(indices),
axis);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(argsort,
ops::ArgsortMLUKernel<paddle::platform::float16>,
ops::ArgsortMLUKernel<float>,
ops::ArgsortMLUKernel<int8_t>,
ops::ArgsortMLUKernel<uint8_t>,
ops::ArgsortMLUKernel<int16_t>,
ops::ArgsortMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(argsort_grad,
ops::ArgsortGradMLUKernel<paddle::platform::float16>,
ops::ArgsortGradMLUKernel<float>,
ops::ArgsortGradMLUKernel<int8_t>,
ops::ArgsortGradMLUKernel<uint8_t>,
ops::ArgsortGradMLUKernel<int16_t>,
ops::ArgsortGradMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/assign_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class AssignMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Assign(
ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(assign,
ops::AssignMLUKernel<int>,
ops::AssignMLUKernel<float>,
ops::AssignMLUKernel<plat::float16>,
ops::AssignMLUKernel<bool>)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h"
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(assign_value,
ops::AssignValueKernel<bool>,
ops::AssignValueKernel<int>,
ops::AssignValueKernel<int64_t>,
ops::AssignValueKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUBatchNormOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto &place = ctx.GetPlace();
const float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
bool global_stats = test_mode || use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_var = ctx.Input<phi::DenseTensor>("Variance");
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
auto *y = ctx.Output<phi::DenseTensor>("Y");
auto *mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
auto *variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
auto *saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
auto *saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
// alloc memory
y->mutable_data<T>(place);
mean_out->mutable_data<MPDType>(place);
variance_out->mutable_data<MPDType>(place);
saved_mean->mutable_data<MPDType>(place);
saved_variance->mutable_data<MPDType>(place);
phi::DenseTensor transformed_x;
phi::DenseTensor transformed_y;
const int transformed_dim_size = 4;
const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
transformed_shape,
ToCnnlDataType<T>(),
CNNL_LAYOUT_NHWC);
MLUCnnlTensorDesc others_input_desc(*scale);
// If the input is 2-D and the format is NCHW, it can be regarded as NHWC,
// so no transpose is needed.
bool need_transpose =
(data_layout == DataLayout::kNCHW && x_dims.size() != 2);
if (need_transpose) {
auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
const int x_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc x_reshaped_desc(
transformed_dim_size, x_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
x_reshaped_desc.get(),
GetBasePtr(x),
transformed_desc.get(),
GetBasePtr(&transformed_x));
} else {
transformed_x = *x;
transformed_y = *y;
}
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
phi::DenseTensor mom_cpu;
framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
MLUCnnl::FusedBatchNorm(ctx,
!global_stats,
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(bias),
GetBasePtr(running_mean),
GetBasePtr(running_var),
epsilon,
momentum,
transformed_desc.get(),
GetBasePtr(&transformed_y),
GetBasePtr(mean_out),
GetBasePtr(variance_out),
GetBasePtr(saved_mean),
GetBasePtr(saved_variance));
if (need_transpose) {
const int y_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc y_reshaped_desc(
transformed_dim_size, y_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnl::Transpose(ctx,
perm,
transformed_y.dims().size(),
transformed_desc.get(),
GetBasePtr(&transformed_y),
y_reshaped_desc.get(),
GetBasePtr(y));
}
}
};
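// Note: the CNNL fused batch norm here is given NHWC descriptors. For NCHW inputs with
// rank > 2 the kernel views x as [N, C, H*W, 1] and transposes it with perm {0, 2, 3, 1}
// into the NHWC-shaped buffer [N, H*W, 1, C]; the result is transposed back with
// perm {0, 3, 1, 2} at the end.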
template <typename T>
class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
const auto *saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
// SavedVariance has been inverted in the forward operator (it holds the inverse variance)
const auto *saved_inv_variance =
ctx.Input<phi::DenseTensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
const float epsilon = ctx.Attr<float>("epsilon");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto d_x_tmp =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
scale->dims(), dev_ctx);
auto bias_grad_tmp =
ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
if (d_x == nullptr) {
d_x = &d_x_tmp;
}
if (d_scale == nullptr) {
d_scale = &scale_grad_tmp;
}
if (d_bias == nullptr) {
d_bias = &bias_grad_tmp;
}
const auto &place = ctx.GetPlace();
d_x->mutable_data<T>(place);
d_scale->mutable_data<MPDType>(place);
d_bias->mutable_data<MPDType>(place);
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
phi::DenseTensor transformed_d_y;
phi::DenseTensor transformed_x;
phi::DenseTensor transformed_d_x;
const int transformed_dim_size = 4;
const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
transformed_shape,
ToCnnlDataType<T>(),
CNNL_LAYOUT_NHWC);
MLUCnnlTensorDesc others_input_desc(*scale);
bool need_transpose =
(data_layout == DataLayout::kNCHW && x_dims.size() != 2);
if (need_transpose) {
transformed_d_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_d_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
const int org_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc org_reshaped_desc(
transformed_dim_size, org_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
org_reshaped_desc.get(),
GetBasePtr(d_y),
transformed_desc.get(),
GetBasePtr(&transformed_d_y));
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
org_reshaped_desc.get(),
GetBasePtr(x),
transformed_desc.get(),
GetBasePtr(&transformed_x));
} else {
transformed_d_y = *d_y;
transformed_x = *x;
transformed_d_x = *d_x;
}
if (use_global_stats) {
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_variance = ctx.Input<phi::DenseTensor>("Variance");
MLUCnnl::FusedBatchNormGrad(ctx,
false /*is_training*/,
transformed_desc.get(),
GetBasePtr(&transformed_d_y),
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(running_mean),
GetBasePtr(running_variance),
epsilon,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
GetBasePtr(d_scale),
GetBasePtr(d_bias));
} else {
MLUCnnl::FusedBatchNormGrad(ctx,
true /*is_training*/,
transformed_desc.get(),
GetBasePtr(&transformed_d_y),
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(saved_mean),
GetBasePtr(saved_inv_variance),
epsilon,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
GetBasePtr(d_scale),
GetBasePtr(d_bias));
}
if (need_transpose) {
const int d_x_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc d_x_reshaped_desc(
transformed_dim_size, d_x_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
d_x_reshaped_desc.get(),
GetBasePtr(d_x));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(batch_norm,
ops::MLUBatchNormOpKernel<float>,
ops::MLUBatchNormOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(batch_norm_grad,
ops::MLUBatchNormGradOpKernel<float>,
ops::MLUBatchNormGradOpKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class BCELossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*labels);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::BceLoss(ctx,
CNNL_BCE_LOSS_NONE,
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(labels),
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class BCELossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*labels);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::BceLossBackward(ctx,
CNNL_BCE_LOSS_NONE,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(labels),
nullptr,
nullptr,
x_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(bce_loss,
ops::BCELossMLUKernel<float>,
ops::BCELossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(bce_loss_grad,
ops::BCELossGradMLUKernel<float>,
ops::BCELossGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace paddle {
namespace operators {
template <typename T>
class CastMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
auto place = ctx.GetPlace();
if (src_type == dst_type) {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
output->mutable_data<T>(place);
framework::TensorCopy(*input, place, dev_ctx, output);
return;
}
PADDLE_ENFORCE_EQ(MLUSupportsCast(src_type, dst_type),
true,
platform::errors::InvalidArgument(
"MLU not support cast [%d] to [%d]",
framework::DataTypeToString(src_type),
framework::DataTypeToString(dst_type)));
output->mutable_data(place, framework::TransToPhiDataType(dst_type));
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
cnnlCastDataType_t cast_type = GetCastDataType(src_type, dst_type);
MLUCnnl::Cast(ctx,
cast_type,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
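// Note: when in_dtype == out_dtype the kernel degenerates to a plain TensorCopy on the
// device; otherwise it validates the (src, dst) pair with MLUSupportsCast before calling
// MLUCnnl::Cast with the cast type returned by GetCastDataType.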
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(cast,
ops::CastMLUKernel<float>,
ops::CastMLUKernel<int>,
ops::CastMLUKernel<int16_t>,
ops::CastMLUKernel<uint8_t>,
ops::CastMLUKernel<bool>,
ops::CastMLUKernel<int64_t>,
ops::CastMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ClipMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto min = static_cast<T>(ctx.Attr<float>("min"));
auto max = static_cast<T>(ctx.Attr<float>("max"));
if (ctx.HasInput("Min")) {
phi::DenseTensor min_cpu;
auto* min_tensor = ctx.Input<phi::DenseTensor>("Min");
auto* min_data = min_tensor->data<T>();
if (platform::is_mlu_place(min_tensor->place())) {
paddle::framework::TensorCopySync(
*min_tensor, platform::CPUPlace(), &min_cpu);
min_data = min_cpu.data<T>();
}
min = min_data[0];
}
if (ctx.HasInput("Max")) {
phi::DenseTensor max_cpu;
auto* max_tensor = ctx.Input<phi::DenseTensor>("Max");
auto* max_data = max_tensor->data<T>();
if (platform::is_mlu_place(max_tensor->place())) {
paddle::framework::TensorCopySync(
*max_tensor, platform::CPUPlace(), &max_cpu);
max_data = max_cpu.data<T>();
}
max = max_data[0];
}
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Clip(ctx,
x_desc.get(),
GetBasePtr(x),
static_cast<const void*>(&min),
static_cast<const void*>(&max),
GetBasePtr(out));
}
};
template <typename T>
class ClipGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto* min_tensor =
ctx.HasInput("Min") ? ctx.Input<phi::DenseTensor>("Min") : nullptr;
auto* max_tensor =
ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
auto min_val = ctx.Attr<float>("min");
if (min_tensor) {
phi::DenseTensor min_data;
framework::TensorCopy(
*min_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&min_data);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
min_val = static_cast<float>(min_data.data<T>()[0]);
}
auto max_val = ctx.Attr<float>("max");
if (max_tensor) {
phi::DenseTensor max_data;
framework::TensorCopy(
*max_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&max_data);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
max_val = static_cast<float>(max_data.data<T>()[0]);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::HardtanhBackward(ctx,
x_desc.get(),
GetBasePtr(x),
dout_desc.get(),
GetBasePtr(dout),
max_val,
min_val,
dx_desc.get(),
GetBasePtr(dx));
}
};
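// Note: clip and hardtanh share the same gradient (pass dout through where
// min < x < max, zero elsewhere), so the backward kernel reuses
// MLUCnnl::HardtanhBackward with min_val/max_val taken from the attributes or the
// optional Min/Max input tensors.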
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(clip,
ops::ClipMLUKernel<float>,
ops::ClipMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(clip_grad,
ops::ClipGradMLUKernel<float>,
ops::ClipGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class ConcatMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
auto axis = ctx.Attr<int>("axis");
auto ins_size = ins.size();
bool need_resize_out_dims = false;
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor<int>(axis_tensor)[0];
need_resize_out_dims = true;
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
if (need_resize_out_dims) {
const size_t n = ins.size();
std::vector<framework::DDim> ins_dims(n);
for (size_t i = 0; i < n; i++) {
ins_dims[i] = ins[i]->dims();
}
framework::DDim out_dims =
phi::funcs::ComputeAndCheckShape(true, ins_dims, axis);
out->Resize(out_dims);
}
const int axis_t = axis;
const int ins_size_t = ins_size;
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
// collect input descriptors and data pointers
std::vector<const void*> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
*ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(GetBasePtr(ins[i]));
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
// launch the cnnl concat on the collected inputs
MLUCnnl::Concat(ctx,
ins_size_t,
axis_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class ConcatGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
auto outs = ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("X"));
auto axis = ctx.Attr<int>("axis");
int split_num = ins.size();
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor<int>(axis_tensor)[0];
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
PADDLE_ENFORCE_GE(axis,
0,
platform::errors::InvalidArgument(
"concat_grad: axis should be larger than or "
"equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(
axis,
out_grad->dims().size(),
platform::errors::InvalidArgument(
"concat_grad: axis should be less than ins[0]->dims()!"
"But received axis is %d, while ins[0]->dims()"
"size is %d.",
axis,
out_grad->dims().size()));
// get output tensors whose names are not kEmptyVarName
std::vector<void*> outputs_vec;
std::vector<phi::DenseTensor> tmp_outputs_vec;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> descs_vec;
for (size_t j = 0; j < outs.size(); ++j) {
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j]));
outputs_vec.push_back(GetBasePtr(outs[j]));
} else {
phi::DenseTensor tmp_tensor;
tmp_tensor.mutable_data<T>(ins[j]->dims(), ctx.GetPlace());
tmp_outputs_vec.push_back(tmp_tensor);
output_descs.emplace_back(MLUCnnlTensorDesc(*ins[j]));
outputs_vec.push_back(GetBasePtr(&(tmp_outputs_vec.back())));
}
descs_vec.push_back(output_descs.back().get());
}
MLUCnnlTensorDesc out_grad_desc(*out_grad);
MLUCnnl::Split(ctx,
static_cast<int>(split_num),
static_cast<int>(axis),
out_grad_desc.get(),
GetBasePtr(out_grad),
descs_vec.data(),
outputs_vec.data());
}
};
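// Note: concat_grad is implemented as MLUCnnl::Split of dout along `axis`. Outputs whose
// gradient variable is empty (kEmptyVarName) or zero-sized still need a destination, so
// throw-away temporaries shaped like the corresponding inputs are allocated for them.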
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(concat,
ops::ConcatMLUKernel<float>,
ops::ConcatMLUKernel<paddle::platform::float16>,
ops::ConcatMLUKernel<int64_t>,
ops::ConcatMLUKernel<bool>,
ops::ConcatMLUKernel<int>,
ops::ConcatMLUKernel<uint8_t>);
REGISTER_OP_MLU_KERNEL(concat_grad,
ops::ConcatGradMLUKernel<float>,
ops::ConcatGradMLUKernel<paddle::platform::float16>,
ops::ConcatGradMLUKernel<int64_t>,
ops::ConcatGradMLUKernel<bool>,
ops::ConcatGradMLUKernel<int>,
ops::ConcatGradMLUKernel<uint8_t>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
template <typename T>
class MLUConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
auto* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_tensor(output->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
auto output_dims = output->dims();
output_tensor.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
}
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_desc(
output_tensor, data_layout, ToCnnlDataType(output_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
MLUCnnl::ConvolutionForward(ctx,
conv_desc.get(),
nullptr /*alpha*/,
nullptr /*beta*/,
nullptr /*bias_desc*/,
nullptr /*bias_ptr*/,
input_desc.get(),
GetBasePtr(&input_tensor),
filter_desc.get(),
GetBasePtr(&trans_filter),
output_desc.get(),
GetBasePtr(&output_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&output_tensor,
output,
false /*need_reshape_or_alloc*/);
}
}
};
template <typename T>
class MLUConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto input = ctx.Input<phi::DenseTensor>("Input");
auto filter = ctx.Input<phi::DenseTensor>("Filter");
auto output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
auto input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
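// Gradients are computed in NHWC as well: input and output_grad are shared or
// transposed into NHWC tensors, and the results are transposed back at the end.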
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_grad_tensor(output_grad->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
} else {
// transpose input and output_grad from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&output_grad_tensor,
true /*need_reshape_or_alloc*/);
}
input_tensor.set_layout(DataLayout::kNHWC);
output_grad_tensor.set_layout(DataLayout::kNHWC);
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
auto filter_grad_dims = filter_grad->dims();
phi::DenseTensor temp_filter_grad(filter_grad->type());
temp_filter_grad.mutable_data<T>({filter_grad_dims[0],
filter_grad_dims[2],
filter_grad_dims[3],
filter_grad_dims[1]},
ctx.GetPlace());
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc temp_filter_grad_desc(
temp_filter_grad, data_layout, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropFilter(ctx,
conv_desc.get(),
input_desc.get(),
GetBasePtr(&input_tensor),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
temp_filter_grad_desc.get(),
GetBasePtr(&temp_filter_grad));
// transpose filter_grad from MHWC to MCHW
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&temp_filter_grad,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor(input_grad->type());
if (channel_last) {
input_grad_tensor.ShareDataWith(*input_grad);
} else {
auto input_grad_dims = input_grad->dims();
input_grad_tensor.mutable_data<T>({input_grad_dims[0],
input_grad_dims[2],
input_grad_dims[3],
input_grad_dims[1]},
ctx.GetPlace());
}
input_grad_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc in_grad_desc(
input_grad_tensor, data_layout, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropInput(ctx,
conv_desc.get(),
filter_desc.get(),
GetBasePtr(&trans_filter),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
in_grad_desc.get(),
GetBasePtr(&input_grad_tensor));
if (!channel_last) {
// transpose input_grad from NHWC to NCHW
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&input_grad_tensor,
input_grad,
false /*need_reshape_or_alloc*/);
}
}
}
};
template <typename T>
class MLUDepthwiseConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
auto* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
int groups;
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_tensor(output->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
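// For depthwise convolution, groups equals the number of input channels,
// taken from the channel dimension of the input below.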
if (channel_last) {
groups = in_dims[3];
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
} else {
// transpose input from NCHW to NHWC
groups = in_dims[1];
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
auto output_dims = output->dims();
output_tensor.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
}
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_desc(
output_tensor, data_layout, ToCnnlDataType(output_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
MLUCnnl::ConvolutionForward(ctx,
conv_desc.get(),
nullptr /*alpha*/,
nullptr /*beta*/,
nullptr /*bias_desc*/,
nullptr /*bias_ptr*/,
input_desc.get(),
GetBasePtr(&input_tensor),
filter_desc.get(),
GetBasePtr(&trans_filter),
output_desc.get(),
GetBasePtr(&output_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&output_tensor,
output,
false /*need_reshape_or_alloc*/);
}
}
};
template <typename T>
class MLUDepthwiseConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto input = ctx.Input<phi::DenseTensor>("Input");
auto filter = ctx.Input<phi::DenseTensor>("Filter");
auto output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
auto input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
int groups;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_grad_tensor(output_grad->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
const std::vector<int> perm_hwcm_to_mchw = {3, 2, 0, 1};
const std::vector<int> perm_mchw_to_hwcm = {2, 3, 1, 0};
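// The filter gradient is produced directly in HWCN (HWCM) layout, which the
// cnnl kernel handles more efficiently, and transposed back to MCHW afterwards.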
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
groups = in_dims[3];
} else {
groups = in_dims[1];
// transpose input and output_grad from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&output_grad_tensor,
true /*need_reshape_or_alloc*/);
}
input_tensor.set_layout(DataLayout::kNHWC);
output_grad_tensor.set_layout(DataLayout::kNHWC);
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
auto filter_grad_dims = filter_grad->dims();
phi::DenseTensor temp_filter_grad(filter_grad->type());
// diff_w is set up in HWCN layout for better performance; see the CNNL
// documentation for details.
temp_filter_grad.mutable_data<T>({filter_grad_dims[perm_mchw_to_hwcm[0]],
filter_grad_dims[perm_mchw_to_hwcm[1]],
filter_grad_dims[perm_mchw_to_hwcm[2]],
filter_grad_dims[perm_mchw_to_hwcm[3]]},
ctx.GetPlace());
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc temp_filter_grad_desc(
temp_filter_grad, CNNL_LAYOUT_HWCN, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropFilter(ctx,
conv_desc.get(),
input_desc.get(),
GetBasePtr(&input_tensor),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
temp_filter_grad_desc.get(),
GetBasePtr(&temp_filter_grad));
// transpose filter_grad from HWCM to MCHW
TransposeFromMLUTensor<T>(ctx,
perm_hwcm_to_mchw,
&temp_filter_grad,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor(input_grad->type());
if (channel_last) {
input_grad_tensor.ShareDataWith(*input_grad);
} else {
auto input_grad_dims = input_grad->dims();
input_grad_tensor.mutable_data<T>({input_grad_dims[0],
input_grad_dims[2],
input_grad_dims[3],
input_grad_dims[1]},
ctx.GetPlace());
}
input_grad_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc in_grad_desc(
input_grad_tensor, data_layout, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropInput(ctx,
conv_desc.get(),
filter_desc.get(),
GetBasePtr(&trans_filter),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
in_grad_desc.get(),
GetBasePtr(&input_grad_tensor));
if (!channel_last) {
// transpose input_grad from NHWC to NCHW
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&input_grad_tensor,
input_grad,
false /*need_reshape_or_alloc*/);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(conv2d,
ops::MLUConvOpKernel<float>,
ops::MLUConvOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(conv2d_grad,
ops::MLUConvGradOpKernel<float>,
ops::MLUConvGradOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(depthwise_conv2d,
ops::MLUDepthwiseConvOpKernel<float>,
ops::MLUDepthwiseConvOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(depthwise_conv2d_grad,
ops::MLUDepthwiseConvGradOpKernel<float>,
ops::MLUDepthwiseConvGradOpKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
template <typename T>
class Conv2DTransposeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> output_padding =
ctx.Attr<std::vector<int>>("output_padding");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string data_format = ctx.Attr<std::string>("data_format");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
// check dimension
const bool channel_last = data_format == "NHWC";
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_tensor(output->type());
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
auto output_dims = output->dims();
output_tensor.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
}
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
// construct MLU attr
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_desc(
output_tensor, data_layout, ToCnnlDataType(output_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
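// The forward pass of conv2d_transpose is computed as the data gradient of a
// regular convolution, hence the ConvBackpropInput call below.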
MLUCnnl::ConvBackpropInput(ctx,
conv_desc.get(),
filter_desc.get(),
GetBasePtr(&trans_filter),
input_desc.get(),
GetBasePtr(&input_tensor),
output_desc.get(),
GetBasePtr(&output_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&output_tensor,
output,
false /*need_reshape_or_alloc*/);
}
}
};
template <typename T>
class Conv2DTransposeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
phi::DenseTensor* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
phi::DenseTensor* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const int groups = ctx.Attr<int>("groups");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const phi::DataLayout data_layout = phi::StringToDataLayout(data_format);
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
const bool channel_last = (data_layout == phi::DataLayout::kNHWC);
framework::DDim in_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_grad_tensor(output_grad->type());
output_grad_tensor.set_layout(DataLayout::kNHWC);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&output_grad_tensor,
true /*need_reshape_or_alloc*/);
}
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
// MLU descs
cnnlTensorLayout_t data_layout_mlu = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout_mlu, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc trans_filter_desc(
trans_filter, data_layout_mlu, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_grad_desc(
output_grad_tensor,
data_layout_mlu,
ToCnnlDataType(output_grad_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor filter_grad_tensor(filter_grad->type());
// filter_grad is always MCHW, while filter_grad_tensor is always MHWC
auto filter_grad_dims = filter_grad->dims();
filter_grad_tensor.mutable_data<T>({filter_grad_dims[0],
filter_grad_dims[2],
filter_grad_dims[3],
filter_grad_dims[1]},
ctx.GetPlace());
filter_grad_tensor.set_layout(DataLayout::kNHWC);
MLUCnnlTensorDesc filter_grad_desc(
filter_grad_tensor,
data_layout_mlu,
ToCnnlDataType(filter_grad_tensor.dtype()));
MLUCnnl::ConvBackpropFilter(ctx,
conv_desc.get(),
output_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
input_desc.get(),
GetBasePtr(&input_tensor),
filter_grad_desc.get(),
GetBasePtr(&filter_grad_tensor));
// transpose filter_grad from MHWC to MCHW
const std::vector<int> perm_to_mchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_mchw,
&filter_grad_tensor,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor(input_grad->type());
input_grad_tensor.set_layout(DataLayout::kNHWC);
if (channel_last) {
input_grad_tensor.ShareDataWith(*input_grad);
} else {
auto input_grad_dims = input_grad->dims();
input_grad_tensor.mutable_data<T>({input_grad_dims[0],
input_grad_dims[2],
input_grad_dims[3],
input_grad_dims[1]},
ctx.GetPlace());
}
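// The input gradient of conv2d_transpose is a regular forward convolution of
// output_grad with the filter, computed via ConvolutionForward below.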
MLUCnnlTensorDesc input_grad_desc(
input_grad_tensor,
data_layout_mlu,
ToCnnlDataType(input_grad_tensor.dtype()));
MLUCnnl::ConvolutionForward(ctx,
conv_desc.get(),
nullptr /*alpha*/,
nullptr /*beta*/,
nullptr /*bias_desc*/,
nullptr /*bias_ptr*/,
output_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
trans_filter_desc.get(),
GetBasePtr(&trans_filter),
input_grad_desc.get(),
GetBasePtr(&input_grad_tensor));
if (!channel_last) {
// transpose input_grad from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&input_grad_tensor,
input_grad,
false /*need_reshape_or_alloc*/);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(conv2d_transpose,
ops::Conv2DTransposeMLUKernel<float>,
ops::Conv2DTransposeMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(conv2d_transpose_grad,
ops::Conv2DTransposeGradMLUKernel<float>,
ops::Conv2DTransposeGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class CumSumMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int axis = ctx.Attr<int>("axis");
bool exclusive = ctx.Attr<bool>("exclusive");
bool reverse = ctx.Attr<bool>("reverse");
bool flatten = ctx.Attr<bool>("flatten");
out->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor* input_ptr = const_cast<phi::DenseTensor*>(x);
phi::DenseTensor flat_x(x->type());
if (flatten) {
PADDLE_ENFORCE_EQ(
axis,
-1,
platform::errors::InvalidArgument(
"when flatten is true, attr axis must be default %d, but got %d",
-1,
axis));
flat_x.ShareDataWith(*x);
flat_x.Resize(phi::make_ddim({x->numel()}));
input_ptr = &flat_x;
}
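// Normalize a negative axis to its non-negative counterpart before calling cnnl.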
const int true_axis = (axis < 0) ? input_ptr->dims().size() + axis : axis;
MLUCnnlTensorDesc input_desc(*input_ptr);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Cumsum(ctx,
true_axis,
exclusive,
reverse,
input_desc.get(),
GetBasePtr(input_ptr),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(cumsum,
ops::CumSumMLUKernel<int>,
ops::CumSumMLUKernel<float>,
ops::CumSumMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class DeformableConvMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* offset = ctx.Input<phi::DenseTensor>("Offset");
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
auto* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const int groups = ctx.Attr<int>("groups");
const int deformable_groups = ctx.Attr<int>("deformable_groups");
const int im2col_step = ctx.Attr<int>("im2col_step");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
// TODO(fwg): Remove this check when cnnl fix the bug that groups > 1.
PADDLE_ENFORCE_EQ(
groups == 1,
true,
platform::errors::InvalidArgument(
"MLU deformable_conv kernel only support groups == 1, but get %d.",
groups));
// transform paddings from {h, w} to {top, bottom, left, right}.
const std::vector<int> trans_paddings{
paddings[0], paddings[0], paddings[1], paddings[1]};
MLUCnnlDCNDesc dcn_desc(input->dims().size(),
trans_paddings.data(),
strides.data(),
dilations.data(),
deformable_groups,
groups,
im2col_step);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
phi::DenseTensor trans_input(input->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_offset(offset->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
offset,
&trans_offset,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_mask(mask->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_filter(filter->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
phi::DenseTensor tmp_output(output->dtype());
auto output_dims = output->dims();
tmp_output.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
trans_input, data_layout, ToCnnlDataType(trans_input.dtype()));
MLUCnnlTensorDesc offset_desc(
trans_offset, data_layout, ToCnnlDataType(trans_offset.dtype()));
MLUCnnlTensorDesc mask_desc(
trans_mask, data_layout, ToCnnlDataType(trans_mask.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.dtype()));
MLUCnnlTensorDesc output_desc(
tmp_output, data_layout, ToCnnlDataType(tmp_output.dtype()));
MLUCnnl::DCNForward(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
filter_desc.get(),
GetBasePtr(&trans_filter),
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(&tmp_output));
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_output,
output,
false /*need_reshape_or_alloc*/);
}
};
template <typename T>
class DeformableConvGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
auto* offset_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Offset"));
auto* mask_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Mask"));
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
auto* offset = ctx.Input<phi::DenseTensor>("Offset");
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
int groups = ctx.Attr<int>("groups");
int deformable_groups = ctx.Attr<int>("deformable_groups");
int im2col_step = ctx.Attr<int>("im2col_step");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
// TODO(fwg): Remove this check when cnnl fix the bug that groups > 1.
PADDLE_ENFORCE_EQ(groups == 1,
true,
platform::errors::InvalidArgument(
"MLU deformable_conv_grad kernel only support groups "
"== 1, but get %d.",
groups));
// transform paddings from {h, w} to {top, bottom, left, right}.
const std::vector<int> trans_paddings{
paddings[0], paddings[0], paddings[1], paddings[1]};
MLUCnnlDCNDesc dcn_desc(input->dims().size(),
trans_paddings.data(),
strides.data(),
dilations.data(),
deformable_groups,
groups,
im2col_step);
phi::DenseTensor tmp_input_grad;
auto input_dims = input->dims();
tmp_input_grad.mutable_data<T>(
{input_dims[0], input_dims[2], input_dims[3], input_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_filter_grad;
auto filter_dims = filter->dims();
tmp_filter_grad.mutable_data<T>(
{filter_dims[0], filter_dims[2], filter_dims[3], filter_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_offset_grad;
auto offset_dims = offset->dims();
tmp_offset_grad.mutable_data<T>(
{offset_dims[0], offset_dims[2], offset_dims[3], offset_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_mask_grad;
auto mask_dims = mask->dims();
tmp_mask_grad.mutable_data<T>(
{mask_dims[0], mask_dims[2], mask_dims[3], mask_dims[1]},
ctx.GetPlace());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
phi::DenseTensor trans_output_grad(output_grad->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&trans_output_grad,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_input(input->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_offset(offset->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
offset,
&trans_offset,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_mask(mask->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_filter(filter->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc output_grad_desc(
trans_output_grad,
data_layout,
ToCnnlDataType(trans_output_grad.dtype()));
MLUCnnlTensorDesc input_desc(
trans_input, data_layout, ToCnnlDataType(trans_input.dtype()));
MLUCnnlTensorDesc offset_desc(
trans_offset, data_layout, ToCnnlDataType(trans_offset.dtype()));
MLUCnnlTensorDesc mask_desc(
trans_mask, data_layout, ToCnnlDataType(trans_mask.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.dtype()));
MLUCnnl::DCNBackwardData(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
filter_desc.get(),
GetBasePtr(&trans_filter),
output_grad_desc.get(),
GetBasePtr(&trans_output_grad),
input_desc.get(),
GetBasePtr(&tmp_input_grad),
offset_desc.get(),
GetBasePtr(&tmp_offset_grad),
mask_desc.get(),
GetBasePtr(&tmp_mask_grad));
MLUCnnl::DCNBackwardWeight(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
output_grad_desc.get(),
GetBasePtr(&trans_output_grad),
filter_desc.get(),
GetBasePtr(&tmp_filter_grad),
nullptr,
nullptr);
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_input_grad,
input_grad,
false /*need_reshape_or_alloc*/);
}
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_filter_grad,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (offset_grad) {
offset_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_offset_grad,
offset_grad,
false /*need_reshape_or_alloc*/);
}
if (mask_grad) {
mask_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_mask_grad,
mask_grad,
false /*need_reshape_or_alloc*/);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(deformable_conv, ops::DeformableConvMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(deformable_conv_grad,
ops::DeformableConvGradMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class DropoutMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto is_test = ctx.Attr<bool>("is_test");
auto* seed_tensor =
ctx.HasInput("Seed") ? ctx.Input<phi::DenseTensor>("Seed") : nullptr;
auto dropout_implementation =
ctx.Attr<std::string>("dropout_implementation");
const bool is_upscale = (dropout_implementation == "upscale_in_train");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
if (is_test && is_upscale) {
// dropout op for inference: out = input.
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
out);
return;
} else if (!is_test) {
// dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or
// out = input * mask.
int seed_data = 0;
if (seed_tensor) {
if (platform::is_mlu_place(seed_tensor->place())) {
memory::Copy(platform::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
sizeof(int));
} else {
seed_data = *(seed_tensor->data<int>());
}
} else {
seed_data = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
}
auto* mask = ctx.Output<phi::DenseTensor>("Mask");
mask->mutable_data<uint8_t>(ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
// Special case when dropout_prob is 1.0
if (dropout_prob == 1.0f) {
auto value_t = static_cast<T>(0.0f);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
out_desc.get(),
GetBasePtr(out));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
mask_desc.get(),
GetBasePtr(mask));
return;
}
// create mlu random generator
const int device_id = ctx.GetPlace().GetDeviceId();
auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);
// compute out = input * mask / ( 1.0 - dropout_prob )
MLUCnnl::FusedDropout(ctx,
mlu_gen_random->get(),
x_desc.get(),
GetBasePtr(x),
dropout_prob,
GetBasePtr(&(mlu_gen_random->get_state())),
mask_desc.get(),
GetBasePtr(mask),
out_desc.get(),
GetBasePtr(out));
if (is_upscale) {
return;
}
}
// In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
phi::DenseTensor scale_tensor(x->dtype());
phi::DenseTensor bias_tensor(x->dtype());
scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(
ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);
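// Multiply by (1 - dropout_prob): Scale is applied with scale = (1 - dropout_prob)
// and bias = 0.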
MLUCnnl::Scale(ctx,
0,
is_test ? x_desc.get() : out_desc.get(),
is_test ? GetBasePtr(x) : GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class DropoutGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(!ctx.Attr<bool>("is_test"),
true,
platform::errors::InvalidArgument(
"GradOp is only callable when is_test is false"));
auto* grad_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* grad_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto dropout_impl = ctx.Attr<std::string>("dropout_implementation");
grad_x->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc grad_x_desc(*grad_x);
if (dropout_prob == 1.) {
auto value_t = static_cast<T>(0.0f);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
grad_x_desc.get(),
GetBasePtr(grad_x));
return;
}
// cast mask from uint8 to float32/float16
phi::DenseTensor cast_mask(grad_x->dtype());
cast_mask.Resize(mask->dims());
cast_mask.mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
MLUCnnlTensorDesc cast_mask_desc(cast_mask);
cnnlCastDataType_t cast_type =
GetCastDataType(framework::TransToProtoVarType(mask->dtype()),
framework::TransToProtoVarType(cast_mask.dtype()));
MLUCnnl::Cast(ctx,
cast_type,
mask_desc.get(),
GetBasePtr(mask),
cast_mask_desc.get(),
GetBasePtr(&cast_mask));
const bool is_upscale = (dropout_impl == "upscale_in_train");
const float scale = is_upscale ? (1.0f / (1.0f - dropout_prob)) : (1.0f);
auto data_type = ToCnnlDataType<T>();
MLUCnnlTensorDesc grad_out_desc(*grad_out);
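// grad_x = grad_out * cast_mask, with the result scaled by 1 / (1 - dropout_prob)
// in upscale_in_train mode and left unscaled otherwise.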
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
cast_mask_desc.get(),
GetBasePtr(&cast_mask),
grad_out_desc.get(),
GetBasePtr(grad_out),
grad_x_desc.get(),
GetBasePtr(grad_x),
data_type,
scale);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(dropout,
ops::DropoutMLUKernel<float>,
ops::DropoutMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(dropout_grad,
ops::DropoutGradMLUKernel<float>,
ops::DropoutGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/expand_as_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ExpandAsV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto target_rank = target_shape.size();
PADDLE_ENFORCE_GE(target_rank,
rank,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be greater than or equal to "
"the rank (%d) of the input 'x'.",
target_rank,
rank));
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument("The rank (%d) of the input 'x' for "
"expand_as_v2 op must be positive.",
rank));
PADDLE_ENFORCE_LE(target_rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be less than or equal to %d.",
target_rank,
MAX_RANK_SUPPORTED));
ExpandAs(context);
}
protected:
void ExpandAs(const framework::ExecutionContext& context) const {
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = target_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(target_shape[i],
0,
platform::errors::InvalidArgument(
"The value of target shape cannot be zero."));
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
target_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in "
"target tensor for expand_as_v2 op.",
vec_in_dims[i],
target_shape[i]));
}
}
auto* out0 = context.Output<phi::DenseTensor>("Out");
framework::DDim out_dims = phi::make_ddim(target_shape);
out0->Resize(out_dims);
out0->mutable_data<T>(context.GetPlace());
MLUCnnlTensorDesc x_desc(*in0);
MLUCnnlTensorDesc out_desc(*out0);
MLUCnnl::BroadcastTo(context,
x_desc.get(),
GetBasePtr(in0),
out_desc.get(),
GetBasePtr(out0));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_as_v2,
ops::ExpandAsV2MLUKernel<float>,
ops::ExpandAsV2MLUKernel<int>,
ops::ExpandAsV2MLUKernel<int64_t>,
ops::ExpandAsV2MLUKernel<int8_t>,
ops::ExpandAsV2MLUKernel<uint8_t>,
ops::ExpandAsV2MLUKernel<bool>,
ops::ExpandAsV2MLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ExpandV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
auto in_dims = X->dims();
auto expand_shape = get_expand_shape(ctx);
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = expand_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
std::vector<int> final_expand_shape(vec_in_dims.size());
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size cannot be zero."));
if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] -->
// final_expand_shape = [3,4,10,2]
PADDLE_ENFORCE_GT(
expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size (%d) for non-existing dimensions must be "
"positive for expand_v2 op.",
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X =
// [10,1] --> final_expand_shape =
// [3,4,10,4]
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
expand_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in shape for expand_v2 op.",
vec_in_dims[i],
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else {
final_expand_shape[i] = expand_shape[i];
}
} else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape
// = [3,4,10,2]
PADDLE_ENFORCE_EQ(
expand_shape[i],
-1,
platform::errors::InvalidArgument(
"When the value in shape is negative for expand_v2 op, "
"only -1 is supported, but the value received is %d.",
expand_shape[i]));
final_expand_shape[i] = vec_in_dims[i];
}
}
auto rank = X->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand_v2_mlu op must be positive, "
"but the value received is %d.",
rank));
auto shape_size = final_expand_shape.size();
PADDLE_ENFORCE_GE(
shape_size,
rank,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for expand_v2_mlu op must "
"be "
"greater than or equal to the rank (%d) of the input 'X'.",
shape_size,
rank));
framework::DDim out_dims = phi::make_ddim(final_expand_shape);
Out->Resize(out_dims);
auto place = ctx.GetPlace();
Out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*X);
MLUCnnlTensorDesc out_desc(*Out);
MLUCnnl::BroadcastTo(
ctx, x_desc.get(), GetBasePtr(X), out_desc.get(), GetBasePtr(Out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_v2,
ops::ExpandV2MLUKernel<float>,
ops::ExpandV2MLUKernel<paddle::platform::float16>,
ops::ExpandV2MLUKernel<bool>,
ops::ExpandV2MLUKernel<int>,
ops::ExpandV2MLUKernel<int64_t>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class FillAnyLikeMLUKernel : public framework::OpKernel<T> {
public:
using CommonType = typename std::common_type<
float,
typename std::conditional<std::is_same<T, platform::float16>::value,
float,
T>::type>::type;
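// CommonType promotes float16 to float so that the range check in Compute
// compares the filled value against the limits of a common type.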
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
float value = ctx.Attr<float>("value");
auto common_type_value = static_cast<CommonType>(value);
PADDLE_ENFORCE_EQ(
(common_type_value >=
static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
(common_type_value <=
static_cast<CommonType>(std::numeric_limits<T>::max())),
true,
platform::errors::InvalidArgument(
"The filled value is out of range for target type, "
"current kernel type is %s, the range should between %f "
"and %f, but now value is %f.",
typeid(T).name(),
static_cast<CommonType>(std::numeric_limits<T>::lowest()),
static_cast<CommonType>(std::numeric_limits<T>::max()),
value));
PADDLE_ENFORCE_EQ(
std::isnan(value),
false,
platform::errors::InvalidArgument("The filled value is NaN."));
auto value_t = static_cast<T>(value);
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(), GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(fill_any_like,
ops::FillAnyLikeMLUKernel<int>,
ops::FillAnyLikeMLUKernel<int64_t>,
ops::FillAnyLikeMLUKernel<float>,
ops::FillAnyLikeMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantBatchSizeLikeOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto float_value = ctx.Attr<float>("value");
auto str_value = ctx.Attr<std::string>("str_value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto *in = ctx.Input<phi::DenseTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the phi::DenseTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
if (cpu_place) {
auto &dev_ctx = *pool.Get(platform::CPUPlace());
phi::funcs::SetConstant<phi::CPUContext, T> functor;
out->mutable_data(platform::CPUPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const phi::CPUContext &>(dev_ctx),
out,
static_cast<T>(value));
} else {
out->mutable_data(ctx.GetPlace(),
framework::TransToPhiDataType(data_type));
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
MLUCnnlTensorDesc output_desc(*out);
MLUCnnl::Fill(
ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpMLUKernel<int>,
ops::FillConstantBatchSizeLikeOpMLUKernel<float>,
ops::FillConstantBatchSizeLikeOpMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto str_value = ctx.Attr<std::string>("str_value");
auto float_value = ctx.Attr<float>("value");
auto *out_var = ctx.Output<phi::DenseTensor>("Out");
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
if (ctx.HasInput("ValueTensor")) {
auto *value_tensor = ctx.Input<phi::DenseTensor>("ValueTensor");
PADDLE_ENFORCE_EQ(
value_tensor->numel(),
1,
platform::errors::InvalidArgument(
"When use phi::DenseTensor as value to set phi::DenseTensor "
"value in fill_cosntant, "
"value input(ValueTensor) size must be 1, but get %d",
value_tensor->numel()));
value_data = value_tensor->data<T>();
auto tmp_place = value_tensor->place();
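// If the value tensor already lives on the MLU, hand cnnl its device pointer
// instead of a host pointer.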
if (platform::is_mlu_place(tmp_place)) {
pointer_mode = CNNL_POINTER_MODE_DEVICE;
}
}
auto shape = GetShape(ctx);
out_var->mutable_data<T>(shape, ctx.GetPlace());
MLUCnnlTensorDesc output_desc(*out_var);
MLUCnnl::Fill(
ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out_var));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
fill_constant,
paddle::operators::FillConstantMLUKernel<float>,
paddle::operators::FillConstantMLUKernel<bool>,
paddle::operators::FillConstantMLUKernel<int>,
paddle::operators::FillConstantMLUKernel<uint8_t>,
paddle::operators::FillConstantMLUKernel<int16_t>,
paddle::operators::FillConstantMLUKernel<int64_t>,
paddle::operators::FillConstantMLUKernel<paddle::platform::float16>);
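// A minimal, standalone sketch of the string-to-value conversion performed by
// FillConstantMLUKernel above; it assumes only the C++ standard library and is
// illustrative, not part of the operator. "inf", "-inf" and "nan" are handled
// explicitly, mirroring the comment in the kernel that they cannot be read
// back from the stream.
#include <cstdint>
#include <limits>
#include <sstream>
#include <string>
#include <type_traits>

template <typename T>
T ParseFillValue(const std::string &str_value, float float_value) {
  // illustrative helper, not a Paddle API
  if (str_value.empty()) return static_cast<T>(float_value);
  if (str_value == "inf")
    return static_cast<T>(std::numeric_limits<double>::infinity());
  if (str_value == "-inf")
    return static_cast<T>(-std::numeric_limits<double>::infinity());
  if (str_value == "nan")
    return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
  std::stringstream convert_stream(str_value);
  if (std::is_same<int64_t, T>::value) {
    int64_t tmp_value;
    convert_stream >> tmp_value;
    return static_cast<T>(tmp_value);
  }
  double tmp_value;
  convert_stream >> tmp_value;
  return static_cast<T>(tmp_value);
}
// e.g. ParseFillValue<float>("3.5", 0.0f) == 3.5f; ParseFillValue<int>("", 2.0f) == 2.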
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/flatten_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FlattenMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
auto &axes = context.Attr<int>("axis");
auto x_dims = in->dims();
auto out_dims = phi::make_ddim(GetOutputShape(axes, x_dims));
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
static std::vector<int32_t> GetOutputShape(const int axis,
const framework::DDim &in_dims) {
int64_t outer = 1, inner = 1;
for (int i = 0; i < in_dims.size(); ++i) {
if (i < axis) {
outer *= in_dims[i];
} else {
inner *= in_dims[i];
}
}
std::vector<int32_t> out_shape(2);
out_shape[0] = outer;
out_shape[1] = inner;
return out_shape;
}
};
template <typename DeviceContext, typename T>
class FlattenGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto in_dims = ctx.Input<phi::DenseTensor>("X")->dims();
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
d_x);
d_x->Resize(in_dims);
}
};
template <typename DeviceContext, typename T>
class Flatten2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &axes = context.Attr<int>("axis");
auto *in = context.Input<phi::DenseTensor>("X");
auto x_dims = in->dims();
auto *out = context.Output<phi::DenseTensor>("Out");
auto out_dims = phi::make_ddim(
FlattenMLUKernel<DeviceContext, T>::GetOutputShape(axes, x_dims));
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
};
template <typename DeviceContext, typename T>
class Flatten2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
d_x);
d_x->Resize(x_dims);
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
out->mutable_data(context.GetPlace(), in->type());
auto &start_axis = context.Attr<int>("start_axis");
auto &stop_axis = context.Attr<int>("stop_axis");
// make out dims
auto in_dims = in->dims();
auto out_dims =
phi::make_ddim(GetOutputShape(start_axis, stop_axis, in_dims));
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
static std::vector<int32_t> GetOutputShape(const int start_axis,
const int stop_axis,
const framework::DDim &in_dims) {
int64_t outer = 1;
std::vector<int32_t> out_shape;
int in_dims_size = in_dims.size();
out_shape.reserve(in_dims_size - stop_axis + start_axis);
int real_start_axis = start_axis, real_stop_axis = stop_axis;
if (start_axis < 0) {
real_start_axis = start_axis + in_dims_size;
}
if (stop_axis < 0) {
real_stop_axis = stop_axis + in_dims_size;
}
for (int i = 0; i < real_start_axis; ++i) {
out_shape.push_back(in_dims[i]);
}
for (int i = real_start_axis; i <= real_stop_axis; i++) {
if (in_dims[i] == -1 || outer == -1) {
outer = -1;
} else {
outer *= in_dims[i];
}
}
out_shape.push_back(outer);
for (int i = real_stop_axis + 1; i < in_dims_size; i++) {
out_shape.push_back(in_dims[i]);
}
return out_shape;
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<paddle::platform::MLUDeviceContext>(),
d_x);
d_x->Resize(x_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
flatten,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_grad,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten2,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten2_grad,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
float>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
double>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
float>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
double>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int64_t>);
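// A small self-contained sketch of the shape rule implemented by
// FlattenContiguousRangeMLUKernel::GetOutputShape above: the dimensions in
// [start_axis, stop_axis] collapse into a single dimension, with -1 propagated
// for unknown extents. Standard library only; for illustration.
#include <cstdint>
#include <vector>

inline std::vector<int32_t> FlattenRangeShape(int start_axis, int stop_axis,
                                              const std::vector<int64_t> &in) {
  // illustrative re-statement of the kernel's shape logic, not a Paddle API
  const int rank = static_cast<int>(in.size());
  if (start_axis < 0) start_axis += rank;
  if (stop_axis < 0) stop_axis += rank;
  std::vector<int32_t> out;
  for (int i = 0; i < start_axis; ++i)
    out.push_back(static_cast<int32_t>(in[i]));
  int64_t merged = 1;
  for (int i = start_axis; i <= stop_axis; ++i)
    merged = (in[i] == -1 || merged == -1) ? -1 : merged * in[i];
  out.push_back(static_cast<int32_t>(merged));
  for (int i = stop_axis + 1; i < rank; ++i)
    out.push_back(static_cast<int32_t>(in[i]));
  return out;
}
// e.g. FlattenRangeShape(1, 2, {2, 3, 4, 5}) -> {2, 12, 5}.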
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherNdMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->template mutable_data<T>(place);
if (x->numel() == 0) return;
if (index->numel() == 0) {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*x, place, dev_ctx, out);
return;
}
const auto &index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
platform::errors::InvalidArgument(
"Index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s]",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc index_desc(*index);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::GatherNd(ctx,
x_desc.get(),
GetBasePtr(x),
index_desc.get(),
GetBasePtr(index),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class GatherNdGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *x = ctx.Input<phi::DenseTensor>("X");
if (dx->numel() == 0) return;
if (index->numel() == 0) {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
return;
}
phi::DenseTensor tmp_tensor(index->type());
phi::DenseTensor tmp_tensor2(dout->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {1, index_dims[0]};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
tmp_tensor2.ShareDataWith(*dout);
std::vector<int64_t> new_dim2{1};
for (int i = index->numel(); i < x->dims().size(); i++) {
new_dim2.push_back(x->dims()[i]);
}
tmp_tensor2.Resize(phi::make_ddim(new_dim2));
dout = &tmp_tensor2;
}
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dx_desc(*dx);
auto value = static_cast<T>(0);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));
MLUCnnlTensorDesc index_desc(*index);
MLUCnnlTensorDesc dout_desc(*dout);
const cnnlScatterNdMode_t mode = CNNL_SCATTERND_ADD;
MLUCnnl::ScatterNd(ctx,
mode,
index_desc.get(),
GetBasePtr(index),
dout_desc.get(),
GetBasePtr(dout),
dx_desc.get(),
GetBasePtr(dx),
dx_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather_nd,
ops::GatherNdMLUKernel<float>,
ops::GatherNdMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(gather_nd_grad,
ops::GatherNdGradMLUKernel<paddle::platform::float16>,
ops::GatherNdGradMLUKernel<float>);
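// Reference semantics of the forward gather_nd above, written against plain
// std::vector for clarity; the kernel delegates the same contract to CNNL. The
// last dimension of Index addresses a prefix of X's dimensions, and this
// sketch covers the simplest case where one index fully addresses an element
// of a row-major buffer.
#include <cstdint>
#include <vector>

inline float GatherNdOneRef(const std::vector<float> &x,
                            const std::vector<int64_t> &x_dims,
                            const std::vector<int64_t> &index) {
  // illustrative only: index.size() is assumed to equal x_dims.size()
  int64_t offset = 0;
  for (size_t i = 0; i < x_dims.size(); ++i) {
    offset = offset * x_dims[i] + index[i];
  }
  return x[offset];
}
// e.g. for X of shape {2, 3} stored row-major, index {1, 2} picks x[1 * 3 + 2].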
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto axis = ctx.Attr<int>("axis");
const auto index_dims = index->dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto *out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
MLUCnnlTensorDesc index_desc(
1, index_shape_1d, ToCnnlDataType(index->dtype()));
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::GatherFunctor(ctx,
axis,
0 /*batch_dims*/,
x_desc.get(),
GetBasePtr(x),
index_desc.get(),
GetBasePtr(index),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class GatherGradOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
const auto index_dims = index->dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dx_desc(*dx);
auto value = static_cast<T>(0);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));
int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
MLUCnnlTensorDesc index_desc(
1, index_shape_1d, ToCnnlDataType(index->dtype()));
MLUCnnlTensorDesc dout_desc(*dout);
const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterRefFunctor(ctx,
dx_desc.get(),
GetBasePtr(dx),
dout_desc.get(),
GetBasePtr(dout),
index_desc.get(),
GetBasePtr(index),
mode);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather,
ops::GatherOpMLUKernel<float>,
ops::GatherOpMLUKernel<paddle::platform::float16>,
ops::GatherOpMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(gather_grad,
ops::GatherGradOpMLUKernel<float>,
ops::GatherGradOpMLUKernel<paddle::platform::float16>,
ops::GatherGradOpMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<phi::DenseTensor>("Out");
tensor->mutable_data<T>(context.GetPlace());
phi::DenseTensor cpu_tensor(tensor->type());
cpu_tensor.Resize(tensor->dims());
T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::normal_distribution<T> dist(mean, std);
int64_t size = tensor->numel();
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = phi::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
cpu_data[i] = dist(*engine);
}
auto& dev_ctx =
context.template device_context<paddle::platform::MLUDeviceContext>();
framework::TensorCopy(cpu_tensor, context.GetPlace(), dev_ctx, tensor);
dev_ctx.Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gaussian_random, ops::MLUGaussianRandomKernel<float>);
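// A minimal sketch of the host-side sampling path used by the kernel above:
// values are drawn on the CPU with std::normal_distribution and then copied to
// the device tensor. The seeded mt19937_64 below stands in for
// phi::GetCPURandomEngine and is an assumption for illustration only.
#include <cstdint>
#include <random>
#include <vector>

inline std::vector<float> SampleGaussianRef(float mean, float std_dev,
                                            int64_t size, unsigned int seed) {
  std::mt19937_64 engine(seed);  // placeholder for phi::GetCPURandomEngine
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> cpu_data(size);
  for (int64_t i = 0; i < size; ++i) {
    cpu_data[i] = dist(engine);
  }
  return cpu_data;  // the kernel copies a buffer like this to the MLU tensor
}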
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class GridSamplerMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(ctx.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
// input and output data
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("X");
const phi::DenseTensor* grid = ctx.Input<phi::DenseTensor>("Grid");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
int n = input->dims()[0];
int c = input->dims()[1];
int out_h = grid->dims()[1];
int out_w = grid->dims()[2];
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
// attrs
// paddle.nn.functional.grid_sample(x, grid, mode='bilinear',
// padding_mode='zeros', align_corners=True, name=None)
const std::string mode = ctx.Attr<std::string>("mode");
const std::string padding_mode = ctx.Attr<std::string>("padding_mode");
bool align_corners = ctx.Attr<bool>("align_corners");
const std::string data_format = phi::DataLayoutToString(input->layout());
PADDLE_ENFORCE_EQ(
mode == "bilinear",
true,
platform::errors::Unavailable(
"Only support bilinear mode in mlu grid_sample kernel."));
PADDLE_ENFORCE_EQ(
padding_mode == "zeros",
true,
platform::errors::Unavailable(
"Only support zeros padding_mode in mlu grid_sample kernel."));
phi::DenseTensor trans_input(input->dtype());
// transpose input from NCHW to NHWC
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor tmp_output(output->dtype());
tmp_output.mutable_data<T>({n, out_h, out_w, c}, ctx.GetPlace());
MLUCnnlGridSampleDesc grid_sample_desc(mode, padding_mode, align_corners);
MLUCnnlTensorDesc input_desc(
trans_input, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc grid_desc(*grid, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc tmp_output_desc(
tmp_output, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnl::GridSample(ctx,
grid_sample_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
grid_desc.get(),
GetBasePtr(grid),
tmp_output_desc.get(),
GetBasePtr(&tmp_output));
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {
0,
3,
1,
2,
};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_output,
output,
false /*need_reshape_or_alloc*/);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(grid_sampler,
ops::GridSamplerMLUKernel<float>,
ops::GridSamplerMLUKernel<plat::float16>);
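// The kernel above runs the grid sample in NHWC, so it permutes the NCHW input
// with perm = {0, 2, 3, 1} and permutes the result back with {0, 3, 1, 2}. A
// tiny sketch of what such a permutation does to a shape (illustrative helper,
// not a Paddle or CNNL API):
#include <vector>

inline std::vector<int> PermuteShape(const std::vector<int> &dims,
                                     const std::vector<int> &perm) {
  std::vector<int> out(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) out[i] = dims[perm[i]];
  return out;
}
// e.g. PermuteShape({n, c, h, w}, {0, 2, 3, 1}) -> {n, h, w, c}.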
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* residual = ctx.Output<phi::DenseTensor>("Residual");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
// compute y-x
cnnlDataType_t data_type = ToCnnlDataType<T>();
residual->mutable_data<T>(x->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(residual),
data_type);
// compute smoothl1loss
out->mutable_data<T>(x->dims(), place);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossForward(ctx,
x_desc.get(),
GetBasePtr(x),
x_desc.get(), /* target has same shape as x */
GetBasePtr(y),
static_cast<float>(delta),
smoothl1_algo,
x_desc.get(), /* out has same shape as x */
GetBasePtr(out));
// compute multiply by delta
phi::DenseTensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(out->dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Scale(ctx,
axis,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* residual = ctx.Input<phi::DenseTensor>("Residual");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
phi::DenseTensor t_grad_rd;
t_grad_rd =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
if (dx || dy) {
phi::DenseTensor t_zero;
t_zero =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);
MLUCnnlTensorDesc residual_desc(*residual);
MLUCnnlTensorDesc dout_desc(*dout);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossBackward(ctx,
residual_desc.get(),
GetBasePtr(residual),
residual_desc.get(),
GetBasePtr(&t_zero),
dout_desc.get(),
GetBasePtr(dout),
static_cast<float>(delta),
smoothl1_algo,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd));
}
// compute multiply by delta
phi::DenseTensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
if (dx) {
dx->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dx);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dx));
}
if (dy) {
dy->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dy);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
ops::HuberLossMLUKernel<float>,
ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
ops::HuberLossGradMLUKernel<float>,
ops::HuberLossGradMLUKernel<plat::float16>);
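// For reference, the per-element value the forward kernel above produces,
// assuming the usual smooth-L1 definition with threshold delta (which the
// kernel then scales by delta to obtain the Huber loss). A scalar sketch,
// standard library only:
#include <cmath>

inline float HuberLossRef(float x, float y, float delta) {
  // illustrative reference, not the CNNL computation itself
  const float r = std::fabs(y - x);
  const float smooth_l1 =
      r < delta ? 0.5f * r * r / delta : r - 0.5f * delta;
  // delta * smooth_l1 == 0.5*r*r for r < delta, delta*r - 0.5*delta*delta otherwise
  return delta * smooth_l1;
}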
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
inline std::vector<int> get_new_shape_mlu(
const std::vector<const phi::DenseTensor*>& list_new_shape_tensor) {
// get tensor from
std::vector<int> vec_new_shape;
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
PADDLE_ENFORCE_EQ(
tensor->dims(),
phi::make_ddim({1}),
platform::errors::InvalidArgument("shape of dim tensor should be [1]"));
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
}
return vec_new_shape;
}
template <typename T>
class InterpolateV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto input_dims = input->dims();
PADDLE_ENFORCE_GE(
input_dims.size(),
4,
platform::errors::External("MLU Interpolate kernel supports input "
"range greater or equal than 4."));
PADDLE_ENFORCE_LE(
input_dims.size(),
5,
platform::errors::External("MLU Interpolate kernel supports input "
"range less or equal than 5. "));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode");
int align_center = align_corners ? 0 : (align_mode == 1 ? 0 : 1);
int out_d = ctx.Attr<int>("out_d");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
float scale_d = -1;
float scale_h = -1;
float scale_w = -1;
auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
if (list_new_size_tensor.size() > 0) {
// have size tensor
auto new_size = get_new_shape_mlu(list_new_size_tensor);
if (new_size.size() <= 2) {
// default NCHW
out_h = new_size[0];
out_w = new_size[1];
} else {
        // rank of input is 5, NCDHW
out_d = new_size[0];
out_h = new_size[1];
out_w = new_size[2];
}
} else {
auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
auto scale = ctx.Attr<std::vector<float>>("scale");
if (scale_tensor != nullptr) {
std::vector<float> scale_data;
scale_data = phi::GetVectorFromTensor<float>(scale_tensor);
if (scale_data.size() > 1 && scale_data.size() <= 2) {
scale_h = scale_data[0];
scale_w = scale_data[1];
} else if (scale_data.size() > 2) {
scale_d = scale_data[0];
scale_h = scale_data[1];
scale_w = scale_data[2];
} else {
scale_d = scale_data[0];
scale_h = scale_data[0];
scale_w = scale_data[0];
}
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
} else {
if (scale.size() > 1 && scale.size() <= 2) {
scale_h = scale[0];
scale_w = scale[1];
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
} else if (scale.size() > 2) {
scale_d = scale[0];
scale_h = scale[1];
scale_w = scale[2];
PADDLE_ENFORCE_EQ(
scale_d > 0 && scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
}
}
if (scale_h > 0. && scale_w > 0.) {
out_h = static_cast<int>(in_h * scale_h);
out_w = static_cast<int>(in_w * scale_w);
}
if (scale_d > 0.) {
out_d = static_cast<int>(in_d * scale_d);
}
auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
if (out_size != nullptr) {
std::vector<int32_t> out_size_data;
out_size_data = phi::GetVectorFromTensor<int>(out_size);
if (out_size_data.size() <= 2) {
out_h = out_size_data[0];
out_w = out_size_data[1];
} else {
out_d = out_size_data[0];
out_h = out_size_data[1];
out_w = out_size_data[2];
}
}
}
PADDLE_ENFORCE_GT(
out_h,
0,
platform::errors::InvalidArgument("out_h in Attr(out_shape) of "
"Op(interpolate) "
"should be greater than 0."));
PADDLE_ENFORCE_GT(
out_w,
0,
platform::errors::InvalidArgument("out_w in Attr(out_shape) of "
"Op(interpolate) "
"should be greater than 0."));
// do transpose according to cnnl's constraints
// cnnlInterp_v2 only accepts NHWC when mode is CNNL_INTERP_BILINEAR and
// CNNL_INTERP_NEAREST,
framework::DDim dim_in, dim_in_trans, dim_out, dim_out_trans;
phi::DenseTensor transformed_input, transformed_output;
bool need_transpose = input_dims.size() != 2;
if (input_dims.size() == 4) {
// need to do transpose if layout is kNCHW
need_transpose &= data_layout == DataLayout::kNCHW;
if (need_transpose) {
// if need_transpose, do the following
// 1. transpose input NCHW -> NHWC
// 2. interpolation in(NHWC) -> out(NHWC)
        // 3. transpose output NHWC -> NCHW
// dim_in = {n, c, in_h, in_w};
dim_in_trans = {n, in_h, in_w, c};
dim_out = {n, c, out_h, out_w};
dim_out_trans = {n, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
// do transpose on input tensor, then do interpolation
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_NCHW, ToCnnlDataType(input->dtype()));
transformed_input =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_in_trans, dev_ctx);
transformed_output =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_out_trans, dev_ctx);
MLUCnnlTensorDesc input_reshaped_desc(
transformed_input,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_input.dtype()));
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
input_dims.size(),
input_desc.get(),
GetBasePtr(input),
input_reshaped_desc.get(),
GetBasePtr(&transformed_input));
} else {
// if no need_transpose, do the following
// 1. interpolation in(NHWC) -> out(NHWC)
// dim_in = {n, in_h, in_w, c};
dim_out = {n, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
transformed_input = *input;
transformed_output = *output;
}
MLUCnnlTensorDesc input_desc(transformed_input,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_input.dtype()));
MLUCnnlTensorDesc output_desc(transformed_output,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_output.dtype()));
MLUCnnl::Interp(ctx,
GetMLUCnnlInterpMode(interp_method),
align_corners,
align_center,
input_desc.get(),
GetBasePtr(&transformed_input),
output_desc.get(),
GetBasePtr(&transformed_output));
if (need_transpose) {
// if need_transpose, reshape output back to NCHW
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnlTensorDesc output_reshape_desc(
*output, CNNL_LAYOUT_NCHW, ToCnnlDataType(output->dtype()));
MLUCnnl::Transpose(ctx,
perm,
dim_out_trans.size(),
output_desc.get(),
GetBasePtr(&transformed_output),
output_reshape_desc.get(),
GetBasePtr(output));
}
} else {
PADDLE_ENFORCE_EQ(
interp_method,
"trilinear",
platform::errors::External("MLU Interpolate kernel only supports 5D "
"data in trilinear mode."));
// need to do transpose if layout is kNCDHW
need_transpose &= data_layout == DataLayout::kNCHW;
if (need_transpose) {
// if need_transpose, do the following
// 1. transpose input NCDHW -> NDHWC
// 2. interpolation in(NDHWC) -> out(NDHWC)
        // 3. transpose output NDHWC -> NCDHW
// dim_in = {n, c, in_d, in_h, in_w};
dim_in_trans = {n, in_d, in_h, in_w, c};
dim_out = {n, c, out_d, out_h, out_w};
dim_out_trans = {n, out_d, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w && in_d == out_d) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
        // do transpose on input tensor (NCDHW -> NDHWC), then do interpolation
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_NCDHW, ToCnnlDataType(input->dtype()));
transformed_input =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_in_trans, dev_ctx);
transformed_output =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_out_trans, dev_ctx);
MLUCnnlTensorDesc input_reshaped_desc(
transformed_input,
CNNL_LAYOUT_NDHWC,
ToCnnlDataType(transformed_input.dtype()));
const std::vector<int> perm = {0, 2, 3, 4, 1};
MLUCnnl::Transpose(ctx,
perm,
input_dims.size(),
input_desc.get(),
GetBasePtr(input),
input_reshaped_desc.get(),
GetBasePtr(&transformed_input));
} else {
// if no need_transpose, do the following
// 1. interpolation in(NDHWC) -> out(NDHWC)
// dim_in = {n, in_d, in_h, in_w, c};
dim_out = {n, out_d, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w && in_d == out_d) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
transformed_input = *input;
transformed_output = *output;
}
MLUCnnlTensorDesc input_desc(transformed_input,
CNNL_LAYOUT_NDHWC,
ToCnnlDataType(transformed_input.dtype()));
MLUCnnlTensorDesc output_desc(transformed_output,
CNNL_LAYOUT_NDHWC,
ToCnnlDataType(transformed_output.dtype()));
      // use trilinear mode for 5-D (NCDHW) input
MLUCnnl::Interp(ctx,
GetMLUCnnlInterpMode(interp_method),
align_corners,
align_center,
input_desc.get(),
GetBasePtr(&transformed_input),
output_desc.get(),
GetBasePtr(&transformed_output));
if (need_transpose) {
// if need_transpose, reshape output back (NDHWC -> NCDHW)
const std::vector<int> perm = {0, 4, 1, 2, 3};
MLUCnnlTensorDesc output_reshape_desc(
*output, CNNL_LAYOUT_NCDHW, ToCnnlDataType(output->dtype()));
MLUCnnl::Transpose(ctx,
perm,
dim_out_trans.size(),
output_desc.get(),
GetBasePtr(&transformed_output),
output_reshape_desc.get(),
GetBasePtr(output));
}
}
}
};
template <typename T>
class InterpolateV2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto output_grad_dims = output_grad->dims();
PADDLE_ENFORCE_EQ(output_grad_dims.size(),
4,
                      platform::errors::External(
                          "MLU InterpolateGrad kernel only supports 4-D "
                          "(2-D spatial) input."));
auto* input = ctx.Input<phi::DenseTensor>("X");
auto input_dims = input->dims();
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode");
int align_center = align_corners ? 0 : (align_mode == 0 ? 0 : 1);
align_center = 0;
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
float scale_h = -1;
float scale_w = -1;
auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
if (list_new_size_tensor.size() > 0) {
// have size tensor
auto new_size = get_new_shape_mlu(list_new_size_tensor);
out_h = new_size[0];
out_w = new_size[1];
} else {
auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
auto scale = ctx.Attr<std::vector<float>>("scale");
if (scale_tensor != nullptr) {
std::vector<float> scale_data;
scale_data = phi::GetVectorFromTensor<float>(scale_tensor);
if (scale_data.size() > 1) {
scale_h = scale_data[0];
scale_w = scale_data[1];
} else {
scale_h = scale_data[0];
scale_w = scale_data[0];
}
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
} else {
if (scale.size() > 1) {
scale_h = scale[0];
scale_w = scale[1];
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
}
}
if (scale_h > 0. && scale_w > 0.) {
out_h = static_cast<int>(in_h * scale_h);
out_w = static_cast<int>(in_w * scale_w);
}
auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
if (out_size != nullptr) {
std::vector<int32_t> out_size_data;
out_size_data = phi::GetVectorFromTensor<int>(out_size);
out_h = out_size_data[0];
out_w = out_size_data[1];
}
}
framework::DDim dim_grad;
framework::DDim dim_out_grad, dim_out_trans_grad, dim_in_grad,
dim_in_trans_grad;
phi::DenseTensor transformed_output_grad, transformed_input_grad;
bool need_transpose =
input_dims.size() != 2 && data_layout == DataLayout::kNCHW;
if (need_transpose) {
// if need_transpose, do the following
// 1. transpose output_grad NCHW -> NHWC
// 2. InterpBackward output_grad(NHWC) -> input_grad(NHWC)
      // 3. transpose input_grad NHWC -> NCHW
// dim_out_grad = {n, c, out_h, out_w};
dim_out_trans_grad = {n, out_h, out_w, c};
dim_in_grad = {n, c, in_h, in_w};
dim_in_trans_grad = {n, in_h, in_w, c};
input_grad->mutable_data<T>(dim_in_grad, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
return;
}
// do transpose on input tensor, then do interpolation
MLUCnnlTensorDesc input_desc(
*output_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(output_grad->dtype()));
transformed_output_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
dim_out_trans_grad, dev_ctx);
transformed_input_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
dim_in_trans_grad, dev_ctx);
MLUCnnlTensorDesc input_reshaped_desc(
transformed_output_grad,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_output_grad.dtype()));
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
input_dims.size(),
input_desc.get(),
GetBasePtr(output_grad),
input_reshaped_desc.get(),
GetBasePtr(&transformed_output_grad));
} else {
// if no need_transpose, do the following
// 1. InterpBackward output_grad(NHWC) -> input_grad(NHWC)
dim_in_grad = {n, in_h, in_w, c};
input_grad->mutable_data<T>(dim_in_grad, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
return;
}
transformed_output_grad = *output_grad;
transformed_input_grad = *input_grad;
}
MLUCnnlTensorDesc input_desc(
transformed_output_grad,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_output_grad.dtype()));
MLUCnnlTensorDesc output_desc(
transformed_input_grad,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_input_grad.dtype()));
MLUCnnl::InterpBackward(ctx,
GetMLUCnnlInterpBackwardMode(interp_method),
align_corners,
align_center,
input_desc.get(),
GetBasePtr(&transformed_output_grad),
output_desc.get(),
GetBasePtr(&transformed_input_grad));
if (need_transpose) {
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnlTensorDesc output_reshape_desc(
*input_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(input_grad->dtype()));
MLUCnnl::Transpose(ctx,
perm,
dim_in_trans_grad.size(),
output_desc.get(),
GetBasePtr(&transformed_input_grad),
output_reshape_desc.get(),
GetBasePtr(input_grad));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(bilinear_interp_v2,
ops::InterpolateV2MLUKernel<float>,
ops::InterpolateV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(nearest_interp_v2,
ops::InterpolateV2MLUKernel<float>,
ops::InterpolateV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(nearest_interp_v2_grad,
ops::InterpolateV2GradMLUKernel<float>,
ops::InterpolateV2GradMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(bilinear_interp_v2_grad,
ops::InterpolateV2GradMLUKernel<float>,
ops::InterpolateV2GradMLUKernel<plat::float16>);
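// A compact sketch of how the forward kernel above resolves the output height
// and width for the 4-D case: an explicit SizeTensor wins, otherwise the scale
// (tensor or attribute) rescales the input size, and OutSize, when present,
// overrides the result. Names below are illustrative only.
#include <vector>

struct InterpOutHW {
  int out_h;
  int out_w;
};

inline InterpOutHW ResolveOutHW(int in_h, int in_w, int attr_out_h,
                                int attr_out_w, float scale_h, float scale_w,
                                const std::vector<int> *size_tensor,
                                const std::vector<int> *out_size) {
  // illustrative helper, not a Paddle API
  if (size_tensor != nullptr) return {(*size_tensor)[0], (*size_tensor)[1]};
  int out_h = attr_out_h, out_w = attr_out_w;
  if (scale_h > 0.f && scale_w > 0.f) {
    out_h = static_cast<int>(in_h * scale_h);
    out_w = static_cast<int>(in_w * scale_w);
  }
  if (out_size != nullptr) {
    out_h = (*out_size)[0];
    out_w = (*out_size)[1];
  }
  return {out_h, out_w};
}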
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class LabelSmoothMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_t = ctx.Input<phi::DenseTensor>("X");
auto* dist_t = ctx.Input<phi::DenseTensor>("PriorDist");
auto* out_t = ctx.Output<phi::DenseTensor>("Out");
auto epsilon = ctx.Attr<float>("epsilon");
auto epsilon_gt = 1.0f - epsilon;
if (in_t->numel() == 0) return;
out_t->mutable_data<T>(ctx.GetPlace());
auto label_dim = in_t->dims()[in_t->dims().size() - 1];
MLUCnnlTensorDesc x_desc(*in_t);
MLUCnnlTensorDesc out_desc(*out_t);
auto data_type = ToCnnlDataType<T>();
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_ADD, data_type, CNNL_NOT_PROPAGATE_NAN);
if (ctx.HasInput("PriorDist")) {
MLUCnnlTensorDesc dist_desc(*dist_t);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(in_t),
dist_desc.get(),
GetBasePtr(dist_t),
out_desc.get(),
GetBasePtr(out_t),
data_type,
epsilon_gt,
epsilon);
} else {
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor dist_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1, label_dim}, dev_ctx);
MLUCnnlTensorDesc dist_desc(dist_tensor);
auto value = static_cast<T>(1.0f / label_dim);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value,
dist_desc.get(),
GetBasePtr(&dist_tensor));
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(in_t),
dist_desc.get(),
GetBasePtr(&dist_tensor),
out_desc.get(),
GetBasePtr(out_t),
data_type,
epsilon_gt,
epsilon);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(label_smooth,
ops::LabelSmoothMLUKernel<float>,
ops::LabelSmoothMLUKernel<plat::float16>);
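// Element-wise reference for the kernel above: with a prior distribution the
// output is (1 - epsilon) * x + epsilon * dist, and without one the prior
// defaults to the uniform value 1 / label_dim. A scalar sketch for
// illustration only:
inline float LabelSmoothRef(float x, float epsilon, float prior) {
  return (1.0f - epsilon) * x + epsilon * prior;  // prior = 1/label_dim if no PriorDist
}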
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using DDim = framework::DDim;
template <typename T>
class LayerNormMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto epsilon = ctx.Attr<float>("epsilon");
const auto* x = ctx.Input<phi::DenseTensor>("X");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
auto* y = ctx.Output<phi::DenseTensor>("Y");
auto* mean = ctx.Output<phi::DenseTensor>("Mean");
auto* variance = ctx.Output<phi::DenseTensor>("Variance");
auto place = ctx.GetPlace();
y->mutable_data<T>(place);
mean->mutable_data<T>(place);
variance->mutable_data<T>(place);
const auto& x_dims = x->dims();
std::vector<int> scale_bias_axes;
std::vector<int> mean_var_axes;
for (auto i = 0; i < x_dims.size(); ++i) {
if (i >= begin_norm_axis) {
scale_bias_axes.push_back(x_dims[i]);
} else {
mean_var_axes.push_back(x_dims[i]);
}
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc mean_var_desc(
mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType<T>());
// cnnl only support both of scale and bias is NULL or not.
if (!scale && !bias) {
MLUCnnl::LayerNormForward(ctx,
begin_norm_axis,
x_desc.get(),
GetBasePtr(x),
nullptr /*scale_bias_desc*/,
nullptr /*scale*/,
nullptr /*bias*/,
epsilon,
y_desc.get(),
GetBasePtr(y),
mean_var_desc.get(),
GetBasePtr(mean),
GetBasePtr(variance));
} else {
phi::DenseTensor tmp_scale(x->dtype());
if (!scale) {
tmp_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &tmp_scale);
} else {
tmp_scale = *scale;
}
phi::DenseTensor tmp_bias(x->dtype());
if (!bias) {
tmp_bias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), &tmp_bias);
} else {
tmp_bias = *bias;
}
// scale and bias should have same type with x/y
MLUCnnlTensorDesc float32_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_FLOAT);
MLUCnnlTensorDesc float16_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF);
cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16);
phi::DenseTensor final_scale(x->dtype());
if (final_scale.dtype() == DataType::FLOAT16 &&
tmp_scale.dtype() == DataType::FLOAT32) {
final_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
// cast scale to fp16
MLUCnnl::Cast(ctx,
cast_type,
float32_desc.get(),
GetBasePtr(&tmp_scale),
float16_desc.get(),
GetBasePtr(&final_scale));
} else {
final_scale = tmp_scale;
}
phi::DenseTensor final_bias(x->dtype());
if (final_bias.dtype() == DataType::FLOAT16 &&
tmp_bias.dtype() == DataType::FLOAT32) {
final_bias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
// cast bias to fp16
MLUCnnl::Cast(ctx,
cast_type,
float32_desc.get(),
GetBasePtr(&tmp_bias),
float16_desc.get(),
GetBasePtr(&final_bias));
} else {
final_bias = tmp_bias;
}
MLUCnnlTensorDesc scale_bias_desc(
scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType<T>());
MLUCnnl::LayerNormForward(ctx,
begin_norm_axis,
x_desc.get(),
GetBasePtr(x),
scale_bias_desc.get(),
GetBasePtr(&final_scale),
GetBasePtr(&final_bias),
epsilon,
y_desc.get(),
GetBasePtr(y),
mean_var_desc.get(),
GetBasePtr(mean),
GetBasePtr(variance));
}
}
};
template <typename T>
class LayerNormGradMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto* x = ctx.Input<phi::DenseTensor>("X");
const auto* mean = ctx.Input<phi::DenseTensor>("Mean");
const auto* variance = ctx.Input<phi::DenseTensor>("Variance");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
const auto* dy = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dscale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto* dbias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
const auto& x_dims = x->dims();
std::vector<int> scale_bias_axes;
std::vector<int> mean_var_axes;
for (auto i = 0; i < x_dims.size(); ++i) {
if (i >= begin_norm_axis) {
scale_bias_axes.push_back(x_dims[i]);
} else {
mean_var_axes.push_back(x_dims[i]);
}
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnlTensorDesc mean_var_desc(
mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dx_desc(*dx);
phi::DenseTensor tmp_scale(x->dtype());
if (!scale) {
tmp_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &tmp_scale);
} else {
tmp_scale = *scale;
}
MLUCnnlTensorDesc float32_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_FLOAT);
MLUCnnlTensorDesc float16_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF);
cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16);
cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32);
phi::DenseTensor final_scale(x->dtype());
if (final_scale.dtype() == DataType::FLOAT16 &&
tmp_scale.dtype() == DataType::FLOAT32) {
final_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
// cast scale to fp16
MLUCnnl::Cast(ctx,
cast_fp32_to_fp16,
float32_desc.get(),
GetBasePtr(&tmp_scale),
float16_desc.get(),
GetBasePtr(&final_scale));
} else {
final_scale = tmp_scale;
}
phi::DenseTensor tmp_dscale(x->dtype());
if (dscale && (tmp_dscale.dtype() == dscale->dtype())) {
dscale->mutable_data<T>(place);
tmp_dscale = *dscale;
} else {
tmp_dscale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
}
phi::DenseTensor tmp_dbias(x->dtype());
if (dbias && (tmp_dbias.dtype() == dbias->dtype())) {
dbias->mutable_data<T>(place);
tmp_dbias = *dbias;
} else {
tmp_dbias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
}
MLUCnnlTensorDesc scale_desc(
scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType<T>());
MLUCnnl::LayerNormBackward(ctx,
begin_norm_axis,
x_desc.get(),
GetBasePtr(x),
dy_desc.get(),
GetBasePtr(dy),
scale_desc.get(),
GetBasePtr(&final_scale),
mean_var_desc.get(),
GetBasePtr(mean),
GetBasePtr(variance),
dx_desc.get(),
GetBasePtr(dx),
GetBasePtr(&tmp_dscale),
GetBasePtr(&tmp_dbias));
if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 &&
dscale->dtype() == DataType::FLOAT32)) {
dscale->mutable_data<MPDType>(place);
MLUCnnl::Cast(ctx,
cast_fp16_to_fp32,
float16_desc.get(),
GetBasePtr(&tmp_dscale),
float32_desc.get(),
GetBasePtr(dscale));
}
if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 &&
dbias->dtype() == DataType::FLOAT32)) {
dbias->mutable_data<MPDType>(place);
MLUCnnl::Cast(ctx,
cast_fp16_to_fp32,
float16_desc.get(),
GetBasePtr(&tmp_dbias),
float32_desc.get(),
GetBasePtr(dbias));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(layer_norm,
ops::LayerNormMLUKernel<float>,
ops::LayerNormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(layer_norm_grad,
ops::LayerNormGradMLUKernel<float>,
ops::LayerNormGradMLUKernel<plat::float16>);
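// Reference math for one normalized row (the dimensions at and after
// begin_norm_axis), assuming the conventional layer-norm formula with biased
// variance; standard library only, for illustration.
#include <cmath>
#include <cstddef>
#include <vector>

inline std::vector<float> LayerNormRowRef(const std::vector<float> &x,
                                          const std::vector<float> &scale,
                                          const std::vector<float> &bias,
                                          float epsilon) {
  // illustrative reference, not the CNNL computation itself
  const size_t n = x.size();
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v;
  mean /= static_cast<float>(n);
  for (float v : x) var += (v - mean) * (v - mean);
  var /= static_cast<float>(n);
  const float inv_std = 1.f / std::sqrt(var + epsilon);
  std::vector<float> y(n);
  for (size_t i = 0; i < n; ++i) {
    y[i] = scale[i] * (x[i] - mean) * inv_std + bias[i];
  }
  return y;
}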
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class LookupTableV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *ids_t = ctx.Input<phi::DenseTensor>("Ids"); // int tensor
auto *output_t = ctx.Output<phi::DenseTensor>("Out"); // float tensor
auto *table_t = ctx.Input<phi::DenseTensor>("W");
int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
auto *table_var = ctx.InputVar("W");
PADDLE_ENFORCE_EQ(
table_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument("mlu only accept phi::DenseTensor"));
output_t->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc ids_desc(*ids_t);
MLUCnnlTensorDesc table_desc(*table_t);
MLUCnnlTensorDesc output_desc(*output_t);
MLUCnnl::EmbeddingForward(ctx,
padding_idx,
table_desc.get(),
GetBasePtr(table_t),
ids_desc.get(),
static_cast<const int *>(GetBasePtr(ids_t)),
output_desc.get(),
GetBasePtr(output_t));
}
};
template <typename T>
class LookupTableV2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *table_var = ctx.InputVar("W");
PADDLE_ENFORCE_EQ(
table_var->IsType<phi::DenseTensor>(),
true,
platform::errors::PermissionDenied(
"Unsupported Variable Type , idx in "
"LookupTableV2GradMLUKernel should be phi::DenseTensor."));
bool is_sparse = ctx.Attr<bool>("is_sparse");
PADDLE_ENFORCE_EQ(
is_sparse,
false,
platform::errors::InvalidArgument(
"LookupTableV2GradMLUKernel dose NOT support is_sparse = True."));
auto *ids_t = ctx.Input<phi::DenseTensor>("Ids");
auto *output_grad_t =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *table_grad_t =
ctx.Output<phi::DenseTensor>(framework::GradVarName("W"));
table_grad_t->mutable_data<T>(ctx.GetPlace());
int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
int64_t ids_numel = ids_t->numel();
PADDLE_ENFORCE_EQ(
ids_numel <= std::numeric_limits<int32_t>::max(),
true,
platform::errors::OutOfRange(
"Number of ids greater than int32_t::max , please check "
"number of ids in LookupTableV2GradMLUKernel."));
phi::DenseTensor ids_int32(ids_t->dtype());
if (ids_t->dtype() != DataType::INT32) {
ids_int32.mutable_data<int>(ids_t->dims(), ctx.GetPlace());
MLUCnnlTensorDesc ids_desc(*ids_t);
MLUCnnlTensorDesc ids_int32_desc(ids_int32);
auto cast_type = GetCastDataType(ids_t->dtype(), DataType::INT32);
MLUCnnl::Cast(ctx,
cast_type,
ids_desc.get(),
GetBasePtr(ids_t),
ids_int32_desc.get(),
GetBasePtr(&ids_int32));
} else {
ids_int32 = *ids_t;
}
MLUCnnlTensorDesc ids_int32_desc(ids_int32);
MLUCnnlTensorDesc output_grad_desc(*output_grad_t);
MLUCnnlTensorDesc table_grad_desc(*table_grad_t);
MLUCnnl::EmbeddingBackward(ctx,
padding_idx,
false,
ids_int32_desc.get(),
GetBasePtr(&ids_int32),
output_grad_desc.get(),
GetBasePtr(output_grad_t),
table_grad_desc.get(),
GetBasePtr(table_grad_t));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(lookup_table_v2,
ops::LookupTableV2MLUKernel<float>,
ops::LookupTableV2MLUKernel<int>,
ops::LookupTableV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad,
ops::LookupTableV2GradMLUKernel<float>,
ops::LookupTableV2GradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MaskedSelectedMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto input = ctx.Input<phi::DenseTensor>("X");
auto mask = ctx.Input<phi::DenseTensor>("Mask");
auto out = ctx.Output<phi::DenseTensor>("Y");
auto input_dim = input->dims();
auto mask_dim = mask->dims();
PADDLE_ENFORCE_EQ(
input_dim,
mask_dim,
platform::errors::InvalidArgument(
"The dim size of input and mask in OP(masked_selected) "
"must be equal, but got input dim:(%ld), mask dim: "
"(%ld). Please check input "
"value.",
input_dim,
mask_dim));
phi::DenseTensor number(framework::TransToPhiDataType(VT::INT32));
void* number_ptr = number.mutable_data<int32_t>({1}, ctx.GetPlace());
out->Resize(mask->dims());
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc mask_desc(*mask);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Mask(ctx,
CNNL_MASKED_SELECT,
input_desc.get(),
GetBasePtr(input),
mask_desc.get(),
GetBasePtr(mask),
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out),
static_cast<uint32_t*>(number_ptr));
}
};
template <typename T>
class MaskedSelectedGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto mask = ctx.Input<phi::DenseTensor>("Mask");
auto y_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
auto x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
phi::DenseTensor mask_int32, out_size;
std::vector<int32_t> out_size_vec;
mask_int32.mutable_data<int32_t>(mask->dims(), ctx.GetPlace());
out_size.mutable_data<int32_t>({1}, ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
MLUCnnlTensorDesc mask_int32_desc(mask_int32);
MLUCnnlTensorDesc out_size_desc(out_size);
auto cast_type = GetCastDataType(mask->dtype(), DataType::INT32);
MLUCnnl::Cast(ctx,
cast_type,
mask_desc.get(),
GetBasePtr(mask),
mask_int32_desc.get(),
GetBasePtr(&mask_int32));
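// Sum the int32 mask over all dims to count how many elements were selected.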
auto mask_int32_dim = phi::vectorize(mask_int32.dims());
std::vector<int32_t> reduce_dims;
for (size_t i = 0; i < mask_int32_dim.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
std::string reduce_name = "reduce_sum";
cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name);
MLUCnnlReduceDesc reduce_desc(reduce_dims,
reduce_op,
ToCnnlDataType<int32_t>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true,
reduce_desc.get(),
nullptr,
mask_int32_desc.get(),
GetBasePtr(&mask_int32),
0,
nullptr,
nullptr,
out_size_desc.get(),
GetBasePtr(&out_size));
paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec);
dev_ctx.Wait();
phi::DenseTensor mask_int32_tmp;
mask_int32_tmp.ShareDataWith(mask_int32);
mask_int32_tmp.Resize({mask_int32.numel()});
phi::DenseTensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)),
indices_int32(framework::TransToPhiDataType(VT::INT32));
topk_v2_out.mutable_data<int32_t>({mask_int32.numel()}, ctx.GetPlace());
indices_int32.mutable_data<int32_t>({mask_int32.numel()}, ctx.GetPlace());
MLUCnnlTensorDesc topk_v2_out_desc(topk_v2_out);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnlTensorDesc mask_int32_tmp_desc(mask_int32_tmp);
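// Use TopK over the flattened 0/1 mask to collect the indices of the
// selected positions.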
const int dim = 0;
MLUCnnl::TopK(ctx,
mask_int32.numel(),
dim,
true,
false,
mask_int32_tmp_desc.get(),
GetBasePtr(&mask_int32_tmp),
topk_v2_out_desc.get(),
GetBasePtr(&topk_v2_out),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
auto stream = ctx.template device_context<MLUDeviceContext>().stream();
phi::DenseTensor indices_int32_out;
indices_int32_out.mutable_data<int32_t>({out_size_vec[0]}, ctx.GetPlace());
memory::Copy(ctx.GetPlace(),
GetBasePtr(&indices_int32_out),
ctx.GetPlace(),
GetBasePtr(&indices_int32),
out_size_vec[0] * sizeof(int32_t),
stream);
phi::DenseTensor y_grad_tmp_out;
y_grad_tmp_out.mutable_data<T>({out_size_vec[0]}, ctx.GetPlace());
MLUCnnlTensorDesc y_grad_tmp_out_desc(y_grad_tmp_out);
memory::Copy(ctx.GetPlace(),
GetBasePtr(&y_grad_tmp_out),
ctx.GetPlace(),
GetBasePtr(y_grad),
out_size_vec[0] * sizeof(T),
stream);
phi::DenseTensor indices_int32_tmp;
indices_int32_tmp.ShareDataWith(indices_int32_out);
indices_int32_tmp.Resize({out_size_vec[0], 1});
MLUCnnlTensorDesc indices_int32_tmp_desc(indices_int32_tmp);
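// Scatter the selected gradients back to their original flattened
// positions in x_grad.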
const cnnlScatterNdMode_t mode = CNNL_SCATTERND_UPDATE;
x_grad->Resize({x_grad->numel()});
x_grad->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::ScatterNd(ctx,
mode,
indices_int32_tmp_desc.get(),
GetBasePtr(&indices_int32_tmp),
y_grad_tmp_out_desc.get(),
GetBasePtr(&y_grad_tmp_out),
nullptr,
nullptr,
x_grad_desc.get(),
GetBasePtr(x_grad));
x_grad->Resize(mask->dims());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(masked_select,
ops::MaskedSelectedMLUKernel<float>,
ops::MaskedSelectedMLUKernel<int>,
ops::MaskedSelectedMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(masked_select_grad,
ops::MaskedSelectedGradMLUKernel<float>,
ops::MaskedSelectedGradMLUKernel<int>,
ops::MaskedSelectedGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
static void Mul(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const float alpha) {
Out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out),
ToCnnlDataType<T>(),
alpha);
}
template <typename T>
static void MatMul2D(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y,
const float alpha) {
Out->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_LT(fabs(alpha - 1.0),
std::numeric_limits<float>::epsilon(),
platform::errors::InvalidArgument(
"MLU(matmul): alpha should be equal to 1.0! "
"Other values are not supported yet."
"But received alpha is %d.",
alpha));
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Matmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void MatMulND(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y,
const float alpha) {
if (!Out->initialized()) {
Out->mutable_data<T>(ctx.GetPlace());
}
PADDLE_ENFORCE_LT(fabs(alpha - 1.0),
std::numeric_limits<float>::epsilon(),
platform::errors::InvalidArgument(
"MLU(matmul): alpha should be equal to 1.0! "
"Other values are not supported yet."
"But received alpha is %d.",
alpha));
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::BatchMatmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& bcast_dims,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
std::vector<int64_t> axes;
int64_t size = bcast_dims.size();
int64_t diff = bcast_dims.size() - dims.size();
for (int64_t i = 0; i < size; ++i) {
if (i < diff) {
axes.push_back(i);
continue;
}
if (bcast_dims[i] > dims[i - diff]) {
axes.push_back(i);
}
}
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
std::vector<int> reduce_dims(axes.begin(), axes.end());
MLUCnnlReduceDesc reduce_desc(reduce_dims,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_desc.get(),
nullptr,
in_desc.get(),
GetBasePtr(&in),
0 /*indices_size*/,
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
template <typename T>
class MatMulMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
bool transpose_x = ctx.Attr<bool>("transpose_X");
bool transpose_y = ctx.Attr<bool>("transpose_Y");
float alpha = ctx.Attr<float>("alpha");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(Out->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
// Case 1: [K] x [K] = [1]
// Equal: [1, K] x [K, 1] = [1, 1] => [1]
const bool all_one_dim = (x_ndim == 1 && y_ndim == 1);
if (all_one_dim) {
Out->Resize({1, 1});
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
x_temp.Resize(phi::make_ddim(x_dims));
x_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < y_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.insert(temp_out_dims.end() - 1, 1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
if (y_ndim == 1) {
y_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
y_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < x_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.push_back(1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
if (transpose_y) {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 1],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
K,
y_ndim - 1,
y_dims[y_ndim - 1]));
} else {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 2],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
K,
y_ndim - 2,
y_dims[y_ndim - 2]));
}
if (x_ndim == 2 && y_ndim == 2) {
// Case 2: [M, K] x [K, N] = [M, N]
MatMul2D<T>(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
} else {
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
MatMulND<T>(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
}
if (phi::vectorize(Out->dims()) != out_dims) {
Out->Resize(phi::make_ddim(out_dims));
}
}
};
template <typename T>
class MatMulGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dY = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
bool transpose_x = ctx.Attr<bool>("transpose_X");
bool transpose_y = ctx.Attr<bool>("transpose_Y");
float alpha = ctx.Attr<float>("alpha");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(dOut->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
int out_ndim = out_dims.size();
// Case 1: [K] x [K] = [1]
if (x_ndim == 1 && y_ndim == 1) {
if (dX) {
Mul<T>(ctx, *dOut, *Y, dX, alpha);
}
if (dY) {
Mul<T>(ctx, *dOut, *X, dY, alpha);
}
return;
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp, dout_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
dout_temp.ShareDataWith(*dOut);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
out_dims.insert(out_dims.end() - 1, 1);
x_temp.Resize(phi::make_ddim(x_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
x_ndim = 2;
out_ndim += 1;
}
if (y_ndim == 1) {
y_dims.push_back(1);
out_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
y_ndim = 2;
out_ndim += 1;
}
// Case 2: [M, K] x [K, N] = [M, N]
if (out_ndim == 2) {
if (dX) {
dX->Resize(phi::make_ddim(x_dims));
if (transpose_x) {
MatMul2D<T>(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha);
} else {
MatMul2D<T>(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha);
}
dX->Resize(X->dims());
}
if (dY) {
dY->Resize(phi::make_ddim(y_dims));
if (transpose_y) {
MatMul2D<T>(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha);
} else {
MatMul2D<T>(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha);
}
dY->Resize(Y->dims());
}
return;
}
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
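// Broadcast the batch dims of x and y to match dout, run batched matmul,
// then reduce-sum the broadcast dims back into dX / dY.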
std::vector<int64_t> x_bcast_dims(out_ndim, 1);
std::vector<int64_t> y_bcast_dims(out_ndim, 1);
std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin());
std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin());
std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2);
std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2);
if (dX) {
phi::DenseTensor dx_temp(X->type());
if (x_dims != x_bcast_dims) {
dx_temp.Resize(phi::make_ddim(x_bcast_dims));
} else {
dX->mutable_data<T>(ctx.GetPlace());
dx_temp.ShareDataWith(*dX);
}
if (transpose_x) {
MatMulND<T>(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha);
} else {
MatMulND<T>(
ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, alpha);
}
if (x_dims != x_bcast_dims) {
ReduceDims<T>(ctx, x_dims, x_bcast_dims, dx_temp, dX);
}
}
if (dY) {
phi::DenseTensor dy_temp(Y->type());
if (y_dims != y_bcast_dims) {
dy_temp.Resize(phi::make_ddim(y_bcast_dims));
} else {
dY->mutable_data<T>(ctx.GetPlace());
dy_temp.ShareDataWith(*dY);
}
if (transpose_y) {
MatMulND<T>(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha);
} else {
MatMulND<T>(
ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, alpha);
}
if (y_dims != y_bcast_dims) {
ReduceDims<T>(ctx, y_dims, y_bcast_dims, dy_temp, dY);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(matmul,
ops::MatMulMLUKernel<float>,
ops::MatMulMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(matmul_grad,
ops::MatMulGradMLUKernel<float>,
ops::MatMulGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
static void Mul(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out) {
Out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out),
ToCnnlDataType<T>());
}
template <typename T>
static void MatMul2D(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y) {
Out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Matmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void MatMul2DwithReduceBatch(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y) {
if (!Out->initialized()) {
Out->mutable_data<T>(ctx.GetPlace());
}
// Fold the batch dim into the row dim ([B, M, K] -> [B*M, K]) so that a
// single 2-D matmul also reduces over the batch dimension.
std::vector<int64_t> x_dims = phi::vectorize(X.dims());
std::vector<int64_t> y_dims = phi::vectorize(Y.dims());
std::vector<int> realx_dims(
{static_cast<int>(x_dims[0] * x_dims[1]), static_cast<int>(x_dims[2])});
std::vector<int> realy_dims(
{static_cast<int>(y_dims[0] * y_dims[1]), static_cast<int>(y_dims[2])});
MLUCnnlTensorDesc x_desc(2, realx_dims.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(2, realy_dims.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Matmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void MatMulND(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y) {
if (!Out->initialized()) {
Out->mutable_data<T>(ctx.GetPlace());
}
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::BatchMatmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& bcast_dims,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
std::vector<int64_t> axes;
int64_t size = bcast_dims.size();
int64_t diff = bcast_dims.size() - dims.size();
for (int64_t i = 0; i < size; ++i) {
if (i < diff) {
axes.push_back(i);
continue;
}
if (bcast_dims[i] > dims[i - diff]) {
axes.push_back(i);
}
}
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
std::vector<int> reduce_dims(axes.begin(), axes.end());
MLUCnnlReduceDesc reduce_desc(reduce_dims,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_desc.get(),
nullptr,
in_desc.get(),
GetBasePtr(&in),
0 /*indices_size*/,
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
template <typename T>
class MatMulV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
const bool trans_x = ctx.Attr<bool>("trans_x");
const bool trans_y = ctx.Attr<bool>("trans_y");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(Out->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
// Case 1: [K] x [K] = [1]
// Equal: [1, K] x [K, 1] = [1, 1] => [1]
const bool all_one_dim = (x_ndim == 1 && y_ndim == 1);
if (all_one_dim) {
Out->Resize({1, 1});
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
x_temp.Resize(phi::make_ddim(x_dims));
x_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < y_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.insert(temp_out_dims.end() - 1, 1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
if (y_ndim == 1) {
y_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
y_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < x_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.push_back(1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
if (trans_y) {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 1],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
K,
y_ndim - 1,
y_dims[y_ndim - 1]));
} else {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 2],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
K,
y_ndim - 2,
y_dims[y_ndim - 2]));
}
if (x_ndim == 2 && y_ndim == 2) {
// Case 2: [M, K] x [K, N] = [M, N]
MatMul2D<T>(ctx, x_temp, y_temp, Out, trans_x, trans_y);
} else {
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
MatMulND<T>(ctx, x_temp, y_temp, Out, trans_x, trans_y);
}
if (phi::vectorize(Out->dims()) != out_dims) {
Out->Resize(phi::make_ddim(out_dims));
}
}
};
template <typename T>
class MatMulGradV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dY = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
const bool trans_x = ctx.Attr<bool>("trans_x");
const bool trans_y = ctx.Attr<bool>("trans_y");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(dOut->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
int out_ndim = out_dims.size();
// Case 1: [K] x [K] = [1]
if (x_ndim == 1 && y_ndim == 1) {
if (dX) {
Mul<T>(ctx, *dOut, *Y, dX);
}
if (dY) {
Mul<T>(ctx, *dOut, *X, dY);
}
return;
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp, dout_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
dout_temp.ShareDataWith(*dOut);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
out_dims.insert(out_dims.end() - 1, 1);
x_temp.Resize(phi::make_ddim(x_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
x_ndim = 2;
out_ndim += 1;
}
if (y_ndim == 1) {
y_dims.push_back(1);
out_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
y_ndim = 2;
out_ndim += 1;
}
// Case 2: [M, K] x [K, N] = [M, N]
if (out_ndim == 2) {
if (dX) {
dX->Resize(phi::make_ddim(x_dims));
if (trans_x) {
MatMul2D<T>(ctx, y_temp, dout_temp, dX, trans_y, true);
} else {
MatMul2D<T>(ctx, dout_temp, y_temp, dX, false, !trans_y);
}
dX->Resize(X->dims());
}
if (dY) {
dY->Resize(phi::make_ddim(y_dims));
if (trans_y) {
MatMul2D<T>(ctx, dout_temp, x_temp, dY, true, trans_x);
} else {
MatMul2D<T>(ctx, x_temp, dout_temp, dY, !trans_x, false);
}
dY->Resize(Y->dims());
}
return;
}
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
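// Broadcast the batch dims of x and y to dout's batch dims, run batched
// matmul, and reduce-sum the broadcast dims back into the grads.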
std::vector<int64_t> x_bcast_dims(out_ndim, 1);
std::vector<int64_t> y_bcast_dims(out_ndim, 1);
std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin());
std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin());
std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2);
std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2);
if (dX) {
phi::DenseTensor dx_temp(X->type());
if (x_dims != x_bcast_dims) {
dx_temp.Resize(phi::make_ddim(x_bcast_dims));
} else {
dX->mutable_data<T>(ctx.GetPlace());
dx_temp.ShareDataWith(*dX);
}
if (trans_x) {
MatMulND<T>(ctx, y_temp, dout_temp, &dx_temp, trans_y, true);
} else {
MatMulND<T>(ctx, dout_temp, y_temp, &dx_temp, false, !trans_y);
}
if (x_dims != x_bcast_dims) {
ReduceDims<T>(ctx, x_dims, x_bcast_dims, dx_temp, dX);
}
}
if (dY) {
// Case 3: [B, M, K] x [K, N] = [B, M, N]. Take the reduce-batch path for
// better performance; otherwise the dy_temp tensor in the else branch
// might overflow its numel due to the cnnlTensorDescriptor limitation.
if (x_dims.size() == 3 && phi::vectorize(Y->dims()).size() == 2) {
if (trans_y) {
MatMul2DwithReduceBatch<T>(ctx, dout_temp, x_temp, dY, true, trans_x);
} else {
MatMul2DwithReduceBatch<T>(
ctx, x_temp, dout_temp, dY, !trans_x, false);
}
} else {
phi::DenseTensor dy_temp(Y->type());
if (y_dims != y_bcast_dims) {
dy_temp.Resize(phi::make_ddim(y_bcast_dims));
} else {
dY->mutable_data<T>(ctx.GetPlace());
dy_temp.ShareDataWith(*dY);
}
if (trans_y) {
MatMulND<T>(ctx, dout_temp, x_temp, &dy_temp, true, trans_x);
} else {
MatMulND<T>(ctx, x_temp, dout_temp, &dy_temp, !trans_x, false);
}
if (y_dims != y_bcast_dims) {
ReduceDims<T>(ctx, y_dims, y_bcast_dims, dy_temp, dY);
}
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(matmul_v2,
ops::MatMulV2MLUKernel<float>,
ops::MatMulV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(matmul_v2_grad,
ops::MatMulGradV2MLUKernel<float>,
ops::MatMulGradV2MLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class MeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
const T* in_data = input->data<T>();
T* out_data = output->mutable_data<T>(context.GetPlace());
auto numel = input->numel();
auto rank = input->dims().size();
auto place = context.GetPlace();
auto stream = context.template device_context<MLUDeviceContext>().stream();
if (rank == 0) { // scalar
memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
return;
}
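// Reduce over all dims with CNNL_REDUCE_AVG to get the mean of the whole
// tensor.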
std::vector<int> reduce_dims;
reduce_dims.reserve(rank);
for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i);
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_AVG,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
reinterpret_cast<const void*>(in_data),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
reinterpret_cast<void*>(out_data));
}
};
template <typename T>
class MeanMLUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto output_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(
output_grad->numel(),
1,
platform::errors::InvalidArgument(
"Mean Gradient Input phi::DenseTensor len should be 1. But "
"received Out@Grad's elements num is %d.",
output_grad->numel()));
auto input_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(context.GetPlace());
auto in_data = output_grad->data<T>();
auto numel = input_grad->numel();
auto rank = input_grad->dims().size();
auto out_data = input_grad->data<T>();
auto place = context.GetPlace();
auto stream = context.template device_context<MLUDeviceContext>().stream();
if (rank == 0) { // scalar
memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
return;
}
// Build a tensor filled with 1 / numel(dX).
phi::DenseTensor mean_var(output_grad->dtype());
mean_var.mutable_data<T>(input_grad->dims(), context.GetPlace());
MLUCnnlTensorDesc mean_var_desc(
mean_var, CNNL_LAYOUT_ARRAY, ToCnnlDataType(mean_var.dtype()));
auto value = static_cast<T>(1.0 / static_cast<float>(input_grad->numel()));
MLUCnnl::Fill(context,
CNNL_POINTER_MODE_HOST,
&value,
mean_var_desc.get(),
GetBasePtr(&mean_var));
// dX = (1 / numel) * dOut, broadcast to dX's shape.
MLUCnnlTensorDesc in_desc(
*output_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output_grad->dtype()));
MLUCnnlTensorDesc out_desc(
*input_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input_grad->dtype()));
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context,
op_tensor_desc.get(),
in_desc.get(),
reinterpret_cast<const void*>(in_data),
mean_var_desc.get(),
GetBasePtr(&mean_var),
out_desc.get(),
reinterpret_cast<void*>(out_data),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(mean,
ops::MeanMLUKernel<float>,
ops::MeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(mean_grad,
ops::MeanMLUGradKernel<float>,
ops::MeanMLUGradKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MeshgridMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
PADDLE_ENFORCE_EQ((ins.size() > 1) && (ins.size() < 7),
true,
platform::errors::InvalidArgument(
"Excepted phi::DenseTensor numbers between 2 and 6, "
"but only received d% .",
ins.size()));
int64_t size = ins.size();
std::vector<int64_t> shape(size);
for (int64_t i = 0; i < size; i++) {
switch (ins[i]->dims().size()) {
case 0:
shape[i] = 1;
break;
case 1:
shape[i] = ins[i]->dims()[0];
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected scalar or 1D tensor in the tensor list but got tensor "
"%d: ",
i));
}
}
MLUCnnlTensorDesc out_desc(size, shape.data(), ToCnnlDataType<T>());
framework::DDim out_dims = phi::make_ddim(shape);
for (int64_t i = 0; i < size; i++) {
std::vector<int64_t> view_shape(size, 1);
view_shape[i] = shape[i];
outs[i]->Resize(out_dims);
outs[i]->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc in_desc(size, view_shape.data(), ToCnnlDataType<T>());
MLUCnnl::BroadcastTo(ctx,
in_desc.get(),
GetBasePtr(ins[i]),
out_desc.get(),
GetBasePtr(outs[i]));
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(
meshgrid,
paddle::operators::MeshgridMLUKernel<int>,
paddle::operators::MeshgridMLUKernel<float>,
paddle::operators::MeshgridMLUKernel<int64_t>,
paddle::operators::MeshgridMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class OneHotV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* in = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int depth = ctx.Attr<int>("depth");
if (ctx.HasInput("depth_tensor")) {
std::vector<int32_t> depth_data;
depth_data = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("depth_tensor"));
depth = depth_data[0];
auto out_dims = out->dims();
out_dims[out_dims.size() - 1] = depth;
out->Resize(out_dims);
}
out->mutable_data<float>(ctx.GetPlace());
float on_value = 1.0f, off_value = 0.0f;
const int in_off_dim[1] = {1};
phi::DenseTensor on_value_tensor =
ctx.AllocateTmpTensor<float, MLUDeviceContext>(
framework::DDim(in_off_dim, 1), dev_ctx);
phi::DenseTensor off_value_tensor =
ctx.AllocateTmpTensor<float, MLUDeviceContext>(
framework::DDim(in_off_dim, 1), dev_ctx);
FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor);
FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor);
if (framework::TransToProtoVarType(in->dtype()) ==
framework::proto::VarType::INT32) {
MLUCnnlTensorDesc desc_indices(*in);
MLUCnnl::OneHot(ctx,
desc_indices.get(),
GetBasePtr(in),
depth,
GetBasePtr(&on_value_tensor),
GetBasePtr(&off_value_tensor),
-1,
ToCnnlDataType(out->dtype()),
GetBasePtr(out));
} else {
phi::DenseTensor transformed_in;
transformed_in.mutable_data<int32_t>(in->dims(), dev_ctx.GetPlace());
// use cnnlCast to cast int64_t to int32_t then do one_hot
MLUCnnlTensorDesc in_desc(*in);
MLUCnnlTensorDesc transformed_in_desc(transformed_in);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(in->dtype()),
framework::TransToProtoVarType(transformed_in.dtype()));
MLUCnnl::Cast(ctx,
cast_type,
in_desc.get(),
GetBasePtr(in),
transformed_in_desc.get(),
GetBasePtr(&transformed_in));
MLUCnnl::OneHot(ctx,
transformed_in_desc.get(),
GetBasePtr(&transformed_in),
depth,
GetBasePtr(&on_value_tensor),
GetBasePtr(&off_value_tensor),
-1,
ToCnnlDataType(out->dtype()),
GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(one_hot_v2,
ops::OneHotV2MLUKernel<int32_t>,
ops::OneHotV2MLUKernel<int64_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/pooling.h"
namespace paddle {
namespace operators {
namespace {
cnnlPoolingMode_t ToCnnlPoolingMode(const std::string &pooling_type,
bool exclusive,
bool adaptive) {
cnnlPoolingMode_t pooling_mode;
if (pooling_type == "max") {
pooling_mode = CNNL_POOLING_MAX;
} else if (pooling_type == "avg") {
if (exclusive && !adaptive) {
pooling_mode = CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
} else {
pooling_mode = CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument("Unknown pooling_type: %s",
pooling_type));
}
return pooling_mode;
}
} // namespace
template <typename T>
class MLUPoolOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const phi::DenseTensor *in_x = ctx.Input<phi::DenseTensor>("X");
phi::DenseTensor *out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string data_format = ctx.Attr<std::string>("data_format");
bool global_pooling = ctx.Attr<bool>("global_pooling");
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
bool exclusive = ctx.Attr<bool>("exclusive");
bool adaptive = ctx.Attr<bool>("adaptive");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
PADDLE_ENFORCE_EQ(in_x->dims().size(),
4,
platform::errors::InvalidArgument(
"Only support 4-dims for mlu pool2d kernel."));
const bool channel_last = data_format == "NHWC";
// Defaults assume the NCHW layout.
cnnlTensorLayout_t cnnl_layout = CNNL_LAYOUT_NCHW;
auto out_dims = out->dims();
int64_t out_h = out_dims[2];
int64_t out_w = out_dims[3];
auto in_x_dims = in_x->dims();
framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
if (channel_last) {
cnnl_layout = CNNL_LAYOUT_NHWC;
out_h = out_dims[1];
out_w = out_dims[2];
data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
}
phi::funcs::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
data_dims,
strides,
ksize);
if (global_pooling) {
phi::funcs::UpdateKernelSize(&ksize, data_dims);
}
MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, cnnl_layout, ToCnnlDataType<T>());
cnnlPoolingMode_t pool_mode =
ToCnnlPoolingMode(pooling_type, exclusive, adaptive);
// Transpose NCHW inputs to NHWC, since cnnl pool2d performs worse in the
// NCHW layout.
phi::DenseTensor trans_in_x;
phi::DenseTensor trans_out;
if (channel_last) {
trans_in_x = *in_x;
trans_out = *out;
} else {
std::vector<int> perm{0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm, in_x, &trans_in_x, true /*need_reshape_or_alloc*/);
trans_out = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
{out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx);
}
MLUCnnlTensorDesc trans_in_x_desc(
trans_in_x, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
trans_out, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
if (!adaptive) {
MLUCnnlPoolingDesc pool_desc(pool_mode,
CNNL_NOT_PROPAGATE_NAN,
ksize[0],
ksize[1],
paddings[0],
paddings[1],
paddings[2],
paddings[3],
strides[0],
strides[1],
1 /*row_dilation*/,
1 /*col_dilation*/,
ceil_mode);
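// Some cnnl pooling setups need an extra input initialized on the host;
// query its size, initialize it on the CPU, then copy it to the device
// for PoolingForward.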
size_t extra_input_size = 0;
cnnlHandle_t handle =
ctx.template device_context<MLUDeviceContext>().cnnl_handle();
cnnlGetPoolingExtraInputSize(
handle, pool_mode, out_w, out_h, &extra_input_size);
if (extra_input_size > 0) {
phi::DenseTensor extra_host_tensor;
extra_host_tensor.mutable_data<int8_t>(
{static_cast<int64_t>(extra_input_size)}, platform::CPUPlace());
cnnlInitPoolingExtraInput(handle,
pool_desc.get(),
trans_in_x_desc.get(),
trans_out_desc.get(),
GetBasePtr(&extra_host_tensor));
phi::DenseTensor extra_device_tensor =
ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(extra_input_size)}, dev_ctx);
framework::TensorCopy(
extra_host_tensor, ctx.GetPlace(), &extra_device_tensor);
// Capture extra_host_tensor by value so its holder_ stays alive until
// the asynchronous copy to the device completes.
auto increase_ref_count = [extra_host_tensor]() {
VLOG(4) << "Finished copying extra_host_tensor["
<< GetBasePtr(&extra_host_tensor)
<< "] in mlu pooling kernel.";
};
dev_ctx.AddStreamCallback(increase_ref_count);
MLUCnnl::PoolingForward(
ctx,
pool_mode,
out_h,
out_w,
pool_desc.get(),
nullptr /*alpha*/,
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
nullptr /*beta*/,
GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/,
trans_out_desc.get(),
GetBasePtr(&trans_out));
} else {
MLUCnnl::PoolingForward(ctx,
pool_mode,
out_h,
out_w,
pool_desc.get(),
nullptr /*alpha*/,
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
nullptr /*beta*/,
nullptr /*params_shape_ptr*/,
trans_out_desc.get(),
GetBasePtr(&trans_out));
}
} else {
MLUCnnl::AdaptivePoolingForward(ctx,
pool_mode,
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
trans_out_desc.get(),
GetBasePtr(&trans_out),
nullptr,
nullptr);
}
if (!channel_last) {
std::vector<int> perm{0, 3, 1, 2};
TransposeFromMLUTensor<T>(
ctx, perm, &trans_out, out, false /*need_reshape_or_alloc*/);
}
}
};
template <typename T, typename IDX_T>
class MLUPoolGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const phi::DenseTensor *in_x = ctx.Input<phi::DenseTensor>("X");
const phi::DenseTensor *out = ctx.Input<phi::DenseTensor>("Out");
const phi::DenseTensor *out_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
phi::DenseTensor *in_x_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
in_x_grad->mutable_data<T>(ctx.GetPlace());
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
bool exclusive = ctx.Attr<bool>("exclusive");
bool adaptive = ctx.Attr<bool>("adaptive");
std::string data_format = ctx.Attr<std::string>("data_format");
bool global_pooling = ctx.Attr<bool>("global_pooling");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const bool channel_last = data_format == "NHWC";
auto in_x_dims = in_x->dims();
framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
if (channel_last) {
data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
}
phi::funcs::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
data_dims,
strides,
ksize);
if (global_pooling) {
phi::funcs::UpdateKernelSize(&ksize, data_dims);
}
// Inputs need to be in NHWC layout.
phi::DenseTensor trans_in_x;
phi::DenseTensor trans_out;
phi::DenseTensor trans_out_grad;
phi::DenseTensor trans_in_x_grad;
if (channel_last) {
trans_in_x = *in_x;
trans_out = *out;
trans_out_grad = *out_grad;
trans_in_x_grad = *in_x_grad;
} else {
std::vector<int> perm{0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm, in_x, &trans_in_x, true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(
ctx, perm, out, &trans_out, true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(
ctx, perm, out_grad, &trans_out_grad, true /*need_reshape_or_alloc*/);
auto in_x_grad_dims = in_x_grad->dims();
trans_in_x_grad =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({in_x_grad_dims[0],
in_x_grad_dims[2],
in_x_grad_dims[3],
in_x_grad_dims[1]},
dev_ctx);
}
MLUCnnlTensorDesc trans_in_x_desc(
trans_in_x, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
trans_out, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_grad_desc(
trans_out_grad, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_in_x_grad_desc(
trans_in_x_grad, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
cnnlPoolingMode_t pool_mode =
ToCnnlPoolingMode(pooling_type, exclusive, adaptive);
MLUCnnlPoolingDesc pool_desc(pool_mode,
CNNL_NOT_PROPAGATE_NAN,
ksize[0],
ksize[1],
paddings[0],
paddings[1],
paddings[2],
paddings[3],
strides[0],
strides[1],
1 /*row_dilation*/,
1 /*col_dilation*/,
ceil_mode);
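// Max pooling backward needs the argmax indices, so compute them first
// via PoolingIndex.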
if (pooling_type == "max") {
phi::DenseTensor index_tensor =
ctx.AllocateTmpTensor<IDX_T, MLUDeviceContext>(trans_out_grad.dims(),
dev_ctx);
MLUCnnlTensorDesc index_tensor_desc(
index_tensor, CNNL_LAYOUT_NHWC, ToCnnlDataType<IDX_T>());
MLUCnnl::PoolingIndex(ctx,
pool_desc.get(),
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
index_tensor_desc.get(),
GetBasePtr(&index_tensor));
if (adaptive) {
MLUCnnl::AdaptivePoolingBackward(ctx,
pool_mode,
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
index_tensor_desc.get(),
GetBasePtr(&index_tensor),
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
} else {
MLUCnnl::PoolingBackward(ctx,
pool_desc.get(),
nullptr /*alpha*/,
index_tensor_desc.get(),
GetBasePtr(&index_tensor),
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
nullptr /*beta*/,
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
}
} else {
if (adaptive) {
MLUCnnl::AdaptivePoolingBackward(ctx,
pool_mode,
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
nullptr /*index_tensor_desc.get()*/,
nullptr /*GetBasePtr(&index_tensor)*/,
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
} else {
MLUCnnl::PoolingBackward(ctx,
pool_desc.get(),
nullptr /*alpha*/,
nullptr,
nullptr,
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
nullptr,
nullptr,
nullptr /*beta*/,
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
}
}
if (!channel_last) {
std::vector<int> perm{0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm,
&trans_in_x_grad,
in_x_grad,
false /*need_reshape_or_alloc*/);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(pool2d,
ops::MLUPoolOpKernel<float>,
ops::MLUPoolOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(pool2d_grad,
ops::MLUPoolGradOpKernel<float, int>,
ops::MLUPoolGradOpKernel<plat::float16, int16_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/randperm_op.h"
namespace paddle {
namespace operators {
template <typename T>
class RandpermMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int n = ctx.Attr<int>("n");
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
framework::Variable* out_var = ctx.OutputVar("Out");
phi::DenseTensor* out_tensor =
framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
phi::DenseTensor tmp_tensor;
tmp_tensor.Resize(phi::make_ddim({n}));
T* tmp_data = tmp_tensor.mutable_data<T>(platform::CPUPlace());
random_permate<T>(tmp_data, n, seed);
framework::TensorCopySync(tmp_tensor, ctx.GetPlace(), out_tensor);
}
};
} // namespace operators
} // namespace paddle
template <typename T>
using kernel = paddle::operators::RandpermMLUKernel<T>;
REGISTER_OP_MLU_KERNEL(
randperm, kernel<int64_t>, kernel<int>, kernel<float>, kernel<double>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/range_op.h"
namespace paddle {
namespace operators {
template <typename T>
class RangeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* start_t = context.Input<phi::DenseTensor>("Start");
auto* end_t = context.Input<phi::DenseTensor>("End");
auto* step_t = context.Input<phi::DenseTensor>("Step");
auto* out = context.Output<phi::DenseTensor>("Out");
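// Start/End/Step live on the device; copy each to the CPU and wait
// before reading their scalar values.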
phi::DenseTensor n;
framework::TensorCopy(
*start_t,
platform::CPUPlace(),
context.template device_context<platform::MLUDeviceContext>(),
&n);
context.template device_context<paddle::platform::MLUDeviceContext>()
.Wait();
T start = n.data<T>()[0];
framework::TensorCopy(
*end_t,
platform::CPUPlace(),
context.template device_context<platform::MLUDeviceContext>(),
&n);
context.template device_context<paddle::platform::MLUDeviceContext>()
.Wait();
T end = n.data<T>()[0];
framework::TensorCopy(
*step_t,
platform::CPUPlace(),
context.template device_context<platform::MLUDeviceContext>(),
&n);
context.template device_context<paddle::platform::MLUDeviceContext>()
.Wait();
T step = n.data<T>()[0];
int64_t size = 0;
GetSize(start, end, step, &size);
out->Resize(phi::make_ddim({size}));
out->mutable_data<T>(context.GetPlace());
std::vector<T> odata;
T value = start;
for (int64_t i = 0; i < size; ++i) {
odata.push_back(value);
value += step;
}
framework::TensorFromVector(odata, context.device_context(), out);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(range,
paddle::operators::RangeMLUKernel<int>,
paddle::operators::RangeMLUKernel<int64_t>,
paddle::operators::RangeMLUKernel<float>,
paddle::operators::RangeMLUKernel<double>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class Reshape2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
std::vector<int32_t> target_shape_vector;
auto shape_tensor_vector = ctx.MultiInput<phi::DenseTensor>("ShapeTensor");
if (shape_tensor_vector.size() > 0) {
for (auto* shape_tensor : shape_tensor_vector) {
PADDLE_ENFORCE_EQ(
shape_tensor->dims().size(),
1,
platform::errors::InvalidArgument(
"If the element type of 'shape' in Reshape Op is Tensor, "
"the element's shape must be [1]. But received the element's "
"shape is [%d]",
shape_tensor->dims().size()));
target_shape_vector.push_back(
phi::GetVectorFromTensor<int>(shape_tensor)[0]);
}
} else {
auto* shape_tensor = ctx.HasInput("Shape")
? ctx.Input<phi::DenseTensor>("Shape")
: nullptr;
if (shape_tensor) {
target_shape_vector = phi::GetVectorFromTensor<int>(shape_tensor);
} else {
target_shape_vector = ctx.Attr<std::vector<int>>("shape");
PADDLE_ENFORCE_GT(
target_shape_vector.size(),
0,
platform::errors::InvalidArgument(
"The length of shape attribute should be larger than 0 when "
"input ShapeTensor and Shape are empty!"));
}
}
int num_negative =
std::count(target_shape_vector.begin(), target_shape_vector.end(), -1);
PADDLE_ENFORCE_LE(
num_negative,
1,
platform::errors::InvalidArgument(
"The max number of -1 in shape attribute or shape tensor is 1 "
"but received %d.",
num_negative));
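// A 0 in the target shape keeps the corresponding dim of the input.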
auto it_zero =
std::find(target_shape_vector.begin(), target_shape_vector.end(), 0);
if (it_zero != target_shape_vector.end()) {
int x_rank = x->dims().size();
for (size_t i = 0; i < target_shape_vector.size(); i++) {
if (target_shape_vector[i] == 0) {
PADDLE_ENFORCE_LT(
i,
x_rank,
platform::errors::InvalidArgument(
"The index of 0 in shape attribute or shape tensor",
"should be less than input dim size, ",
"but the index is %d and input dim size is %d",
i,
x_rank));
target_shape_vector[i] = x->dims().at(i);
}
}
}
auto it =
std::find(target_shape_vector.begin(), target_shape_vector.end(), -1);
if (it != target_shape_vector.end()) {
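      // Seeding std::accumulate with -1 folds the single -1 placeholder into
      // the product, so dividing the total element count by it below yields
      // the inferred dimension directly.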
auto ddim_out_vec = phi::vectorize(x->dims());
int ddim_out_product = std::accumulate(
ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies<int>());
int reshape_out_product = std::accumulate(target_shape_vector.begin(),
target_shape_vector.end(),
-1,
std::multiplies<int>());
int index = std::distance(target_shape_vector.begin(), it);
target_shape_vector[index] = ddim_out_product / reshape_out_product;
}
auto out_dims = phi::make_ddim(target_shape_vector);
out->mutable_data<T>(out_dims, ctx.GetPlace());
    // reshape does not change data: copy x into out on the MLU, then resize to
    // the new shape.
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
};
template <typename DeviceContext, typename T>
class Reshape2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto in_dims = d_x->dims();
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
d_x);
d_x->Resize(in_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
reshape2,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, int64_t>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, bool>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
reshape2_grad,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, bool>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using DDim = framework::DDim;
using TensorList = std::vector<phi::DenseTensor>;
template <typename TensorType, typename T>
void reset_parameter_vector(
const std::vector<TensorType>& raw_params_vec,
const int& num_layers,
const bool& is_bidirec,
std::vector<std::vector<std::pair<T*, size_t>>>* params_vec) {
  // the parameter raw sequence is [FWhi, FWhh, BWhi, BWhh] * num_layers
// + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to
// ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers
const int& direction_num = is_bidirec ? 2 : 1;
const int& layer_weight_size = 4 * direction_num;
const int& all_weight_size = num_layers * layer_weight_size;
const int& bias_start_idx = all_weight_size / 2;
for (int i = 0; i < num_layers; i++) {
params_vec->at(i).resize(layer_weight_size);
for (int j = 0; j < layer_weight_size; j++) {
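      // j enumerates [Wi, Wh, Bi, Bh] per direction: k % 2 selects the
      // input/hidden slot, section (= j / 4) selects the direction, and
      // k >= 2 jumps to the bias half of the raw list (bias_start_idx).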
int k = j % 4;
const int& section = j / 4;
int tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
if (k >= 2) {
tensor_idx += bias_start_idx;
}
using remove_cv_t = typename std::remove_cv<T>::type;
params_vec->at(i)[j] = std::make_pair(
const_cast<T*>(
raw_params_vec[tensor_idx]->template data<remove_cv_t>()),
raw_params_vec[tensor_idx]->numel() * sizeof(T));
}
}
}
template <typename DeviceContext, typename T>
class RNNMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Input
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto pre_state = ctx.MultiInput<phi::DenseTensor>("PreState");
auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
bool has_seq_length = ctx.HasInput("SequenceLength");
// Output
auto state = ctx.MultiOutput<phi::DenseTensor>("State");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* reserve_data = ctx.Output<phi::DenseTensor>("Reserve");
// Attributes
const int& num_layers = ctx.Attr<int>("num_layers");
const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
const int& hidden_size = ctx.Attr<int>("hidden_size");
const std::string& mode = ctx.Attr<std::string>("mode");
const phi::DenseTensor* sequence_length = nullptr;
if (has_seq_length) {
sequence_length = ctx.Input<phi::DenseTensor>("SequenceLength");
}
auto init_h = pre_state[0]; // -> hx
auto init_c = pre_state[1]; // -> cx
auto last_h = state[0];
auto last_c = state[1];
// check shape
const int in_out_dim_num = input->dims().size();
const int& seq_len = input->dims()[0]; // time_step
const int& batch_size = input->dims()[1];
const int& input_dim = input->dims()[2];
const int& direction_num = is_bidirec ? 2 : 1;
int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim};
int out_dim_arr[in_out_dim_num] = {
seq_len, batch_size, direction_num * hidden_size};
int proj_size = hidden_size;
std::vector<int> seq_len_vec(batch_size, seq_len);
    if (has_seq_length) {
      // SequenceLength provides the valid length of each sample; without it,
      // every sample is assumed to use the full seq_len (no padding).
seq_len_vec = phi::GetVectorFromTensor(sequence_length);
}
cnnlDirectionMode_t direction =
is_bidirec ? CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL;
PADDLE_ENFORCE_EQ(
mode,
"LSTM",
platform::errors::InvalidArgument(
"MLU only support LSTM mode now, current mode is %s", mode));
PADDLE_ENFORCE_EQ(
num_layers,
1,
platform::errors::InvalidArgument(
"MLU only support 1 num_layers, current num_layers is %s",
num_layers));
PADDLE_ENFORCE_EQ(
init_h->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument("The num_layers of in RNN layer must"
" be the same as first dim of init "
"hidden, but received num_layers:%d,"
" dim:%d",
num_layers,
init_h->dims()[0]));
PADDLE_ENFORCE_EQ(
init_c->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument(
"The num_layers of in RNN layer must"
" be the same as first dim of cell state hidden, but received"
" num_layers:%d, dim:%d",
num_layers,
init_c->dims()[0]));
// weightlist
std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists;
parameter_lists.resize(num_layers);
reset_parameter_vector(
weight_list, num_layers, is_bidirec, &parameter_lists);
// init the output and allocate the memory
output->mutable_data<T>(ctx.GetPlace()); // -> y in cnnl
last_h->mutable_data<T>(ctx.GetPlace()); // -> hy in cnnl
last_c->mutable_data<T>(ctx.GetPlace()); // -> cy in cnnl
MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
in_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
out_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUCnnlTensorDesc hx_desc(*init_h);
MLUCnnlTensorDesc cx_desc(*init_c);
MLURNNDesc rnn_desc(CNNL_LSTM,
CNNL_RNN_DOUBLE_BIAS,
direction,
CNNL_RNN_LINEAR_INPUT,
ToCnnlDataType(input->dtype()),
ToCnnlDataType(input->dtype()),
input_dim,
hidden_size,
/*projection*/ proj_size,
num_layers,
nullptr,
CNNL_RNN_PADDED_IO_DISABLED);
rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED);
// copy weight params
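    // Pack the per-gate parameters into one contiguous weight space in the
    // order [W_x | W_h | B_x | B_h], appending the same four blocks for the
    // backward direction when is_bidirec is true; cnnlGetRNNWeightSpaceSize
    // reports the total size this buffer must have.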
size_t weightspace_size;
phi::DenseTensor weightspace;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize(
GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size));
weightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
{static_cast<int64_t>(weightspace_size)}, dev_ctx);
void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace());
auto w_x = parameter_lists[0][0];
auto w_h = parameter_lists[0][1];
auto b_x = parameter_lists[0][2];
auto b_h = parameter_lists[0][3];
auto actual_total_w_size =
w_x.second + w_h.second + b_x.second + b_h.second;
void* w_x_ptr = weightspace_ptr;
void* w_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second;
void* b_x_ptr =
static_cast<char*>(weightspace_ptr) + w_x.second + w_h.second;
void* b_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second +
w_h.second + b_x.second;
memory::Copy(weightspace.place(),
w_x_ptr,
weightspace.place(),
w_x.first,
w_x.second,
nullptr);
memory::Copy(weightspace.place(),
w_h_ptr,
weightspace.place(),
w_h.first,
w_h.second,
nullptr);
memory::Copy(weightspace.place(),
b_x_ptr,
weightspace.place(),
b_x.first,
b_x.second,
nullptr);
memory::Copy(weightspace.place(),
b_h_ptr,
weightspace.place(),
b_h.first,
b_h.second,
nullptr);
if (is_bidirec) {
auto bw_x = parameter_lists[0][4];
auto bw_h = parameter_lists[0][5];
auto bb_x = parameter_lists[0][6];
auto bb_h = parameter_lists[0][7];
void* bw_x_ptr =
static_cast<char*>(weightspace_ptr) + actual_total_w_size;
void* bw_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second;
void* bb_x_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second;
void* bb_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second +
bb_x.second;
actual_total_w_size +=
bw_x.second + bw_h.second + bb_x.second + bb_h.second;
memory::Copy(weightspace.place(),
bw_x_ptr,
weightspace.place(),
bw_x.first,
bw_x.second,
nullptr);
memory::Copy(weightspace.place(),
bw_h_ptr,
weightspace.place(),
bw_h.first,
bw_h.second,
nullptr);
memory::Copy(weightspace.place(),
bb_x_ptr,
weightspace.place(),
bb_x.first,
bb_x.second,
nullptr);
memory::Copy(weightspace.place(),
bb_h_ptr,
weightspace.place(),
bb_h.first,
bb_h.second,
nullptr);
}
PADDLE_ENFORCE_EQ(weightspace_size,
actual_total_w_size,
platform::errors::InvalidArgument(
"The weightsize doesn't match"
" weightspace_size:%d, actual_total_w_size:%d",
weightspace_size,
actual_total_w_size));
// get reservespace_ptr
int gate_num = 4;
int hidden_data_idx = (num_layers - 1);
hidden_data_idx += (gate_num + 1) * num_layers;
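    // Reserve (gate_num + 1) blocks per layer plus (num_layers - 1)
    // inter-layer blocks for the backward pass, each block holding
    // direction_num * seq_len * batch_size * hidden_size elements.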
const int& block_size = direction_num * seq_len * batch_size * hidden_size;
reserve_data->Resize({hidden_data_idx, block_size});
reserve_data->mutable_data<T>(ctx.GetPlace());
MLUCnnl::RNNForward(ctx,
rnn_desc.get(),
seq_len_vec.data(),
weightspace_ptr,
weightspace_size,
input_seq_data_desc.get(),
GetBasePtr(input),
out_seq_data_desc.get(),
GetBasePtr(output),
hx_desc.get(),
GetBasePtr(init_h),
GetBasePtr(last_h),
cx_desc.get(),
GetBasePtr(init_c),
GetBasePtr(last_c),
GetBasePtr(reserve_data));
if (has_seq_length) {
      // if has_seq_length, mask out the padded time steps in the output of
      // cnnlRNNForwardTraining
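      // Build an int8 mask on the host: positions with t >= seq_len_vec[n] are
      // marked 1, then MLUCnnl::Mask (CNNL_MASKED_FILL) overwrites them with
      // 0.0f so padded time steps contribute nothing to the output.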
auto masked_mode = CNNL_MASKED_FILL;
float off_value = 0.0f;
phi::DenseTensor on_value_tensor(input->dtype());
phi::DenseTensor masked_tensor(framework::TransToPhiDataType(VT::INT8));
phi::DenseTensor h_masked_tensor(framework::TransToPhiDataType(VT::INT8));
on_value_tensor.Resize({1});
masked_tensor.Resize({seq_len, batch_size, direction_num * hidden_size});
h_masked_tensor.Resize(
{seq_len, batch_size, direction_num * hidden_size});
on_value_tensor.mutable_data<T>(ctx.GetPlace());
masked_tensor.mutable_data<int8_t>(ctx.GetPlace());
int8_t* h_masked_ptr =
h_masked_tensor.mutable_data<int8_t>(platform::CPUPlace());
for (int t = 0; t < seq_len; ++t) {
for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < direction_num * hidden_size; ++c) {
auto tmp_seq_len = seq_len_vec[n];
auto offset = t * batch_size * direction_num * hidden_size +
n * direction_num * hidden_size + c;
*(h_masked_ptr + offset) = t >= tmp_seq_len ? 1 : 0;
}
}
}
framework::TensorCopy(
h_masked_tensor, ctx.GetPlace(), dev_ctx, &masked_tensor);
dev_ctx.Wait();
FillMLUTensorWithHostValue(ctx, off_value, &on_value_tensor);
MLUCnnlTensorDesc on_value_desc(on_value_tensor);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnlTensorDesc masked_desc(masked_tensor);
MLUCnnl::Mask(ctx,
masked_mode,
output_desc.get(),
GetBasePtr(output),
masked_desc.get(),
GetBasePtr(&masked_tensor),
on_value_desc.get(),
GetBasePtr(&on_value_tensor),
output_desc.get(),
GetBasePtr(output),
nullptr);
}
}
};
template <typename DeviceContext, typename T>
class RNNMLUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto stream = ctx.template device_context<MLUDeviceContext>().stream();
// get the tensor pointer for the input
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto pre_state = ctx.MultiInput<phi::DenseTensor>("PreState");
auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
auto* output = ctx.Input<phi::DenseTensor>("Out");
auto* reserve_data = ctx.Input<phi::DenseTensor>("Reserve");
const int& num_layers = ctx.Attr<int>("num_layers");
const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
const int& hidden_size = ctx.Attr<int>("hidden_size");
const std::string& mode = ctx.Attr<std::string>("mode");
bool has_seq_length = ctx.HasInput("SequenceLength");
const phi::DenseTensor* sequence_length = nullptr;
if (has_seq_length) {
sequence_length = ctx.Input<phi::DenseTensor>("SequenceLength");
}
PADDLE_ENFORCE_EQ(
mode,
"LSTM",
platform::errors::InvalidArgument(
"XPU only support LSTM mode now, current mode is %s", mode));
auto init_h = pre_state[0]; // -> hx
auto init_c = pre_state[1]; // -> cx
auto output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto state_grad =
ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("State"));
auto last_h_grad = state_grad[0]; // -> dhy
auto last_c_grad = state_grad[1]; // -> dcy
// get the tensor pointer for the output
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto weight_grad_list =
ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("WeightList"));
auto pre_state_grad =
ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("PreState"));
phi::DenseTensor* init_h_grad = nullptr;
phi::DenseTensor* init_c_grad = nullptr;
if (pre_state_grad.size() > 0) { // has gradient
init_h_grad = pre_state_grad[0]; // -> dhx
init_c_grad = pre_state_grad[1]; // -> dcx
}
// check shape
const int in_out_dim_num = input->dims().size();
const int& seq_len = input->dims()[0];
const int& batch_size = input->dims()[1];
const int& input_dim = input->dims()[2];
const int& direction_num = is_bidirec ? 2 : 1;
int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim};
int out_dim_arr[in_out_dim_num] = {
seq_len, batch_size, direction_num * hidden_size};
int proj_size = hidden_size;
PADDLE_ENFORCE_EQ(
num_layers,
1,
platform::errors::InvalidArgument(
"MLU only support 1 num_layers, current num_layers is %s",
num_layers));
PADDLE_ENFORCE_EQ(
init_h->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument("The num_layers of in RNN layer must"
" be the same as first dim of init"
"hidden, but received num_layers:%d,"
" dim:%d",
num_layers,
init_h->dims()[0]));
PADDLE_ENFORCE_EQ(
init_c->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument(
"The num_layers of in RNN layer must"
" be the same as first dim of cell state hidden, but received"
" num_layers:%d, dim:%d",
num_layers,
init_c->dims()[0]));
std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists;
parameter_lists.resize(num_layers);
reset_parameter_vector(
weight_list, num_layers, is_bidirec, &parameter_lists);
for (unsigned int i = 0; i < weight_grad_list.size(); ++i) {
weight_grad_list[i]->mutable_data<T>(ctx.GetPlace());
}
std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists_grad;
parameter_lists_grad.resize(num_layers);
reset_parameter_vector(
weight_grad_list, num_layers, is_bidirec, &parameter_lists_grad);
    // allocate the memory and initialize the input_grad
input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), input_grad);
phi::DenseTensor a, b;
phi::DenseTensor* dynamic_grad_pre_h = &a;
phi::DenseTensor* dynamic_grad_pre_c = &b;
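    // When gradients of PreState are not requested, back dhx/dcx with
    // temporary tensors so MLUCnnl::RNNBackward still has valid destinations.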
if (init_h_grad) {
init_h_grad->mutable_data<T>(last_h_grad->dims(), ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), init_h_grad);
} else {
dynamic_grad_pre_h->Resize(last_h_grad->dims());
dynamic_grad_pre_h->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), dynamic_grad_pre_h);
init_h_grad = dynamic_grad_pre_h;
}
if (init_c_grad) {
init_c_grad->mutable_data<T>(last_c_grad->dims(), ctx.GetPlace());
} else {
dynamic_grad_pre_c->Resize(last_h_grad->dims());
dynamic_grad_pre_c->mutable_data<T>(ctx.GetPlace());
init_c_grad = dynamic_grad_pre_c;
}
std::vector<int> seq_len_vec(batch_size, seq_len);
if (has_seq_length) {
seq_len_vec = phi::GetVectorFromTensor(sequence_length);
}
cnnlDirectionMode_t direction =
is_bidirec ? CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL;
MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
in_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
out_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUCnnlTensorDesc hx_desc(*init_h);
MLUCnnlTensorDesc cx_desc(*init_c);
MLURNNDesc rnn_desc(CNNL_LSTM,
CNNL_RNN_DOUBLE_BIAS,
direction,
CNNL_RNN_LINEAR_INPUT,
ToCnnlDataType(input->dtype()),
ToCnnlDataType(input->dtype()),
input_dim,
hidden_size,
/*projection*/ proj_size,
num_layers,
nullptr,
CNNL_RNN_PADDED_IO_DISABLED);
rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED);
// copy weight
size_t weightspace_size;
phi::DenseTensor weightspace, dweightspace;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize(
GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size));
weightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
{static_cast<int64_t>(weightspace_size)}, dev_ctx);
dweightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
{static_cast<int64_t>(weightspace_size)}, dev_ctx);
void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace());
auto w_x = parameter_lists[0][0];
auto w_h = parameter_lists[0][1];
auto b_x = parameter_lists[0][2];
auto b_h = parameter_lists[0][3];
auto actual_total_w_size =
w_x.second + w_h.second + b_x.second + b_h.second;
void* w_x_ptr = weightspace_ptr;
void* w_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second;
void* b_x_ptr =
static_cast<char*>(weightspace_ptr) + w_x.second + w_h.second;
void* b_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second +
w_h.second + b_x.second;
memory::Copy(weightspace.place(),
w_x_ptr,
weightspace.place(),
w_x.first,
w_x.second,
stream);
memory::Copy(weightspace.place(),
w_h_ptr,
weightspace.place(),
w_h.first,
w_h.second,
stream);
memory::Copy(weightspace.place(),
b_x_ptr,
weightspace.place(),
b_x.first,
b_x.second,
stream);
memory::Copy(weightspace.place(),
b_h_ptr,
weightspace.place(),
b_h.first,
b_h.second,
stream);
if (is_bidirec) {
auto bw_x = parameter_lists[0][4];
auto bw_h = parameter_lists[0][5];
auto bb_x = parameter_lists[0][6];
auto bb_h = parameter_lists[0][7];
void* bw_x_ptr =
static_cast<char*>(weightspace_ptr) + actual_total_w_size;
void* bw_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second;
void* bb_x_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second;
void* bb_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second +
bb_x.second;
actual_total_w_size +=
bw_x.second + bw_h.second + bb_x.second + bb_h.second;
memory::Copy(weightspace.place(),
bw_x_ptr,
weightspace.place(),
bw_x.first,
bw_x.second,
stream);
memory::Copy(weightspace.place(),
bw_h_ptr,
weightspace.place(),
bw_h.first,
bw_h.second,
stream);
memory::Copy(weightspace.place(),
bb_x_ptr,
weightspace.place(),
bb_x.first,
bb_x.second,
stream);
memory::Copy(weightspace.place(),
bb_h_ptr,
weightspace.place(),
bb_h.first,
bb_h.second,
stream);
}
dev_ctx.Wait();
PADDLE_ENFORCE_EQ(weightspace_size,
actual_total_w_size,
platform::errors::InvalidArgument(
"The weightsize doesn't match"
" weightspace_size:%d, actual_total_w_size:%d",
weightspace_size,
actual_total_w_size));
MLUCnnl::RNNBackward(ctx,
rnn_desc.get(),
CNNL_WGRAD_MODE_SET,
seq_len_vec.data(),
GetBasePtr(&weightspace),
GetBasePtr(&dweightspace),
weightspace.numel() * sizeof(T),
input_seq_data_desc.get(),
GetBasePtr(input),
GetBasePtr(input_grad),
out_seq_data_desc.get(),
GetBasePtr(output),
GetBasePtr(output_grad),
hx_desc.get(),
GetBasePtr(init_h),
GetBasePtr(last_h_grad),
GetBasePtr(init_h_grad),
cx_desc.get(),
GetBasePtr(init_c),
GetBasePtr(last_c_grad),
GetBasePtr(init_c_grad),
const_cast<void*>(GetBasePtr(reserve_data)),
reserve_data->numel() * sizeof(T));
void* dweightspace_ptr = dweightspace.mutable_data(ctx.GetPlace());
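    // Scatter the packed gradient weight space back into the individual
    // WeightList gradient tensors, mirroring the packing order used above.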
auto dw_x = parameter_lists_grad[0][0];
auto dw_h = parameter_lists_grad[0][1];
auto db_x = parameter_lists_grad[0][2];
auto db_h = parameter_lists_grad[0][3];
auto dactual_total_w_size =
dw_x.second + dw_h.second + db_x.second + db_h.second;
void* dw_x_ptr = dweightspace_ptr;
void* dw_h_ptr = static_cast<char*>(dweightspace_ptr) + dw_x.second;
void* db_x_ptr =
static_cast<char*>(dweightspace_ptr) + dw_x.second + dw_h.second;
void* db_h_ptr = static_cast<char*>(dweightspace_ptr) + dw_x.second +
dw_h.second + db_x.second;
memory::Copy(weightspace.place(),
dw_x.first,
weightspace.place(),
dw_x_ptr,
dw_x.second,
stream);
memory::Copy(weightspace.place(),
dw_h.first,
weightspace.place(),
dw_h_ptr,
dw_h.second,
stream);
memory::Copy(weightspace.place(),
db_x.first,
weightspace.place(),
db_x_ptr,
db_x.second,
stream);
memory::Copy(weightspace.place(),
db_h.first,
weightspace.place(),
db_h_ptr,
db_h.second,
stream);
if (is_bidirec) {
auto dbw_x = parameter_lists_grad[0][4];
auto dbw_h = parameter_lists_grad[0][5];
auto dbb_x = parameter_lists_grad[0][6];
auto dbb_h = parameter_lists_grad[0][7];
void* dbw_x_ptr =
static_cast<char*>(dweightspace_ptr) + dactual_total_w_size;
void* dbw_h_ptr = static_cast<char*>(dweightspace_ptr) +
dactual_total_w_size + dbw_x.second;
void* dbb_x_ptr = static_cast<char*>(dweightspace_ptr) +
dactual_total_w_size + dbw_x.second + dbw_h.second;
void* dbb_h_ptr = static_cast<char*>(dweightspace_ptr) +
dactual_total_w_size + dbw_x.second + dbw_h.second +
dbb_x.second;
dactual_total_w_size +=
dbw_x.second + dbw_h.second + dbb_x.second + dbb_h.second;
memory::Copy(weightspace.place(),
dbw_x.first,
weightspace.place(),
dbw_x_ptr,
dbw_x.second,
stream);
memory::Copy(weightspace.place(),
dbw_h.first,
weightspace.place(),
dbw_h_ptr,
dbw_h.second,
stream);
memory::Copy(weightspace.place(),
dbb_x.first,
weightspace.place(),
dbb_x_ptr,
dbb_x.second,
stream);
memory::Copy(weightspace.place(),
dbb_h.first,
weightspace.place(),
dbb_h_ptr,
dbb_h.second,
stream);
}
dev_ctx.Wait();
PADDLE_ENFORCE_EQ(weightspace_size,
dactual_total_w_size,
platform::errors::InvalidArgument(
"The weightsize doesn't match"
" weightspace_size:%d, dactual_total_w_size:%d",
weightspace_size,
dactual_total_w_size));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
rnn, ops::RNNMLUKernel<paddle::platform::MLUDeviceContext, float>);
REGISTER_OP_MLU_KERNEL(
rnn_grad, ops::RNNMLUGradKernel<paddle::platform::MLUDeviceContext, float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ROIAlignOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<phi::DenseTensor>("X");
auto* rois = ctx.Input<phi::DenseTensor>("ROIs");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
out->set_layout(phi::DataLayout::kNHWC);
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned");
const auto& in_dims = in->dims();
int batch_size = in_dims[0];
int rois_num = rois->dims()[0];
if (rois_num == 0) return;
auto cplace = platform::CPUPlace();
std::vector<int> roi_batch_id_list(rois_num);
int rois_batch_size = 0;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<phi::DenseTensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size,
batch_size,
platform::errors::InvalidArgument(
"The batch size of rois and the batch size of images "
" must be the same. But received the batch size of rois is %d, "
"and the batch size of images is %d",
rois_batch_size,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace,
rois_num_list.data(),
ctx.GetPlace(),
rois_num_t->data<int>(),
sizeof(int) * rois_batch_size,
nullptr /*stream*/);
int last_idx = 0;
for (int i = 0; i < rois_batch_size; i++) {
int end_idx = last_idx + rois_num_list[i];
for (int j = last_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
last_idx = end_idx;
}
} else {
auto lod = rois->lod();
PADDLE_ENFORCE_EQ(lod.empty(),
false,
platform::errors::InvalidArgument(
"Input(ROIs) phi::DenseTensor of ROIAlignOp "
"does not contain LoD information."));
auto rois_lod = lod.back();
rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(rois_batch_size,
batch_size,
platform::errors::InvalidArgument(
"The rois_batch_size and imgs "
"batch_size must be the same. But received "
"rois_batch_size = %d, "
"batch_size = %d",
rois_batch_size,
batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(
rois_num,
rois_num_with_lod,
platform::errors::InvalidArgument(
"The actual number of rois and the number of rois "
"provided from Input(RoIsLoD) in RoIAlign must be the same."
" But received actual number of rois is %d, and the number "
"of rois from RoIsLoD is %d",
rois_num,
rois_num_with_lod));
for (int i = 0; i < rois_batch_size; i++) {
int start_idx = rois_lod[i];
int end_idx = rois_lod[i + 1];
for (int j = start_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
}
}
// only support float32 for now
phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32));
rois_cpu.Resize({rois_num, 4});
rois_cpu.mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*rois, cplace, dev_ctx, &rois_cpu);
dev_ctx.Wait();
T* rois_cpu_ptr = rois_cpu.mutable_data<T>(platform::CPUPlace());
// boxes; [batch_idx, x1, y1, x2, y2]
phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32));
phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32));
boxes_cpu.Resize({rois_num, 5});
boxes_mlu.Resize({rois_num, 5});
T* boxes_cpu_ptr = boxes_cpu.mutable_data<T>(platform::CPUPlace());
boxes_mlu.mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < rois_num; ++i) {
boxes_cpu_ptr[i * 5 + 0] = static_cast<T>(roi_batch_id_list[i]);
boxes_cpu_ptr[i * 5 + 1] = rois_cpu_ptr[i * 4 + 0];
boxes_cpu_ptr[i * 5 + 2] = rois_cpu_ptr[i * 4 + 1];
boxes_cpu_ptr[i * 5 + 3] = rois_cpu_ptr[i * 4 + 2];
boxes_cpu_ptr[i * 5 + 4] = rois_cpu_ptr[i * 4 + 3];
}
// copy boxes_cpu to boxes_mlu
framework::TensorCopy(boxes_cpu, ctx.GetPlace(), dev_ctx, &boxes_mlu);
dev_ctx.Wait();
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
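    // Run RoiAlign in NHWC layout: transpose the input to NHWC, compute, then
    // transpose the result back to NCHW.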
phi::DenseTensor input_nhwc(in->type());
phi::DenseTensor output_nhwc(out->type());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, in, &input_nhwc, true /*need_reshape_or_alloc*/);
auto output_dims = out->dims();
output_nhwc.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
MLUCnnlTensorDesc input_desc(
input_nhwc, CNNL_LAYOUT_NHWC, ToCnnlDataType(input_nhwc.dtype()));
MLUCnnlTensorDesc boxes_desc(boxes_mlu);
MLUCnnlTensorDesc out_desc(
output_nhwc, CNNL_LAYOUT_NHWC, ToCnnlDataType(output_nhwc.dtype()));
MLUCnnl::RoiAlign(ctx,
pooled_height,
pooled_width,
sampling_ratio,
spatial_scale,
aligned,
input_desc.get(),
GetBasePtr(&input_nhwc),
boxes_desc.get(),
GetBasePtr(&boxes_mlu),
out_desc.get(),
GetBasePtr(&output_nhwc));
TransposeFromMLUTensor<T>(
ctx, perm_to_nchw, &output_nhwc, out, false /*need_reshape_or_alloc*/);
  }
};
template <typename T>
class ROIAlignGradOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* rois = ctx.Input<phi::DenseTensor>("ROIs");
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto spatial_scale = ctx.Attr<T>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned");
int rois_num = rois->dims()[0];
if (!in_grad) {
return;
}
in_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> roi_batch_id_list(rois_num);
auto cplace = platform::CPUPlace();
int rois_batch_size = 0;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<phi::DenseTensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace,
rois_num_list.data(),
ctx.GetPlace(),
rois_num_t->data<int>(),
sizeof(int) * rois_batch_size,
nullptr /*stream*/);
int last_idx = 0;
for (int i = 0; i < rois_batch_size; i++) {
int end_idx = last_idx + rois_num_list[i];
for (int j = last_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
last_idx = end_idx;
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int i = 0; i < rois_batch_size; i++) {
int start_idx = rois_lod[i];
int end_idx = rois_lod[i + 1];
for (int j = start_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
}
}
phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32));
rois_cpu.Resize({rois_num, 4});
rois_cpu.mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*rois, cplace, dev_ctx, &rois_cpu);
dev_ctx.Wait();
T* rois_cpu_ptr = rois_cpu.mutable_data<T>(platform::CPUPlace());
// boxes; [batch_idx, x1, y1, x2, y2]
phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32));
phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32));
boxes_cpu.Resize({rois_num, 5});
boxes_mlu.Resize({rois_num, 5});
T* boxes_cpu_ptr = boxes_cpu.mutable_data<T>(platform::CPUPlace());
boxes_mlu.mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < rois_num; ++i) {
boxes_cpu_ptr[i * 5 + 0] = static_cast<T>(roi_batch_id_list[i]);
boxes_cpu_ptr[i * 5 + 1] = rois_cpu_ptr[i * 4 + 0];
boxes_cpu_ptr[i * 5 + 2] = rois_cpu_ptr[i * 4 + 1];
boxes_cpu_ptr[i * 5 + 3] = rois_cpu_ptr[i * 4 + 2];
boxes_cpu_ptr[i * 5 + 4] = rois_cpu_ptr[i * 4 + 3];
}
// copy boxes_cpu to boxes_mlu
framework::TensorCopy(boxes_cpu, ctx.GetPlace(), dev_ctx, &boxes_mlu);
dev_ctx.Wait();
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
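    // Same NHWC round trip as the forward pass: transpose the incoming
    // gradient to NHWC, run RoiAlignBackward, and transpose the image gradient
    // back to NCHW.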
phi::DenseTensor grads_nhwc(out_grad->type());
phi::DenseTensor grads_image_nhwc(in_grad->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
out_grad,
&grads_nhwc,
true /*need_reshape_or_alloc*/);
auto grads_image_dims = in_grad->dims();
grads_image_nhwc.mutable_data<T>({grads_image_dims[0],
grads_image_dims[2],
grads_image_dims[3],
grads_image_dims[1]},
ctx.GetPlace());
MLUCnnlTensorDesc grads_desc(
grads_nhwc, CNNL_LAYOUT_NHWC, ToCnnlDataType(grads_nhwc.dtype()));
MLUCnnlTensorDesc boxes_desc(boxes_mlu);
MLUCnnlTensorDesc grads_image_desc(
grads_image_nhwc,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(grads_image_nhwc.dtype()));
MLUCnnl::RoiAlignBackward(ctx,
sampling_ratio,
spatial_scale,
aligned,
grads_desc.get(),
GetBasePtr(&grads_nhwc),
boxes_desc.get(),
GetBasePtr(&boxes_mlu),
grads_image_desc.get(),
GetBasePtr(&grads_image_nhwc));
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&grads_image_nhwc,
in_grad,
false /*need_reshape_or_alloc*/);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(roi_align, ops::ROIAlignOpMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(roi_align_grad, ops::ROIAlignGradOpMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ScaleMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* in_var = ctx.InputVar("X");
auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
    // cnnl requires input, scale and bias to have the same type and to reside
    // on the device.
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
phi::DenseTensor scale_tensor;
if (ctx.HasInput("ScaleTensor")) {
phi::DenseTensor float_scale_tensor =
*ctx.Input<phi::DenseTensor>("ScaleTensor");
if (framework::TransToProtoVarType(float_scale_tensor.dtype()) !=
framework::TransToProtoVarType(in->dtype())) {
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc float_scale_desc(float_scale_tensor);
MLUCnnlTensorDesc final_scale_desc(scale_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(float_scale_tensor.dtype()),
framework::TransToProtoVarType(scale_tensor.dtype()));
MLUCnnl::Cast(ctx,
cast_type,
float_scale_desc.get(),
GetBasePtr(&float_scale_tensor),
final_scale_desc.get(),
GetBasePtr(&scale_tensor));
} else {
scale_tensor = float_scale_tensor;
}
} else {
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&scale,
scale_desc.get(),
GetBasePtr(&scale_tensor));
}
auto bias = static_cast<T>(ctx.Attr<float>("bias"));
phi::DenseTensor bias_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&bias,
bias_desc.get(),
GetBasePtr(&bias_tensor));
auto* out_var = ctx.OutputVar("Out");
if (in_var->IsType<phi::SelectedRows>() && in_var != out_var) {
auto& in_slr = in_var->Get<phi::SelectedRows>();
auto* out_slr = out_var->GetMutable<phi::SelectedRows>();
out_slr->set_rows(in_slr.rows());
out_slr->set_height(in_slr.height());
}
auto* out =
framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
out->mutable_data<T>(in->place());
MLUCnnlTensorDesc input_desc(*in);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc output_desc(*out);
const int axis = std::max(in->dims().size() - 1, 0);
auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
if (bias_after_scale) {
MLUCnnl::Scale(ctx,
axis,
input_desc.get(),
GetBasePtr(in),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
output_desc.get(),
GetBasePtr(out));
} else {
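      // bias_after_scale == false means Out = scale * (X + bias); fold the
      // bias into new_bias = scale * bias so the same Scale call
      // (Out = scale * X + new_bias) produces the result.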
phi::DenseTensor new_bias_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc new_bias_desc(new_bias_tensor);
MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL,
ToCnnlDataType(in->dtype()),
CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
new_bias_desc.get(),
GetBasePtr(&new_bias_tensor),
ToCnnlDataType(in->dtype()));
MLUCnnl::Scale(ctx,
axis,
input_desc.get(),
GetBasePtr(in),
scale_desc.get(),
GetBasePtr(&scale_tensor),
new_bias_desc.get(),
GetBasePtr(&new_bias_tensor),
output_desc.get(),
GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(scale,
ops::ScaleMLUKernel<float>,
ops::ScaleMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ScatterMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* indices = ctx.Input<phi::DenseTensor>("Ids");
auto* updates = ctx.Input<phi::DenseTensor>("Updates");
bool overwrite = ctx.Attr<bool>("overwrite");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc indices_desc(*indices);
MLUCnnlTensorDesc updates_desc(*updates);
MLUCnnlTensorDesc out_desc(*out);
cnnlScatterRefMode_t mode;
if (overwrite) {
mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
updates_desc.get(),
GetBasePtr(updates),
indices_desc.get(),
GetBasePtr(indices),
mode);
} else {
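      // overwrite == false: emulate accumulation by first zeroing the selected
      // rows of X and then scatter-adding the updates, so repeated indices sum
      // instead of overwriting each other.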
phi::DenseTensor tensor_zeros(updates->type());
tensor_zeros.mutable_data<T>(updates->dims(), ctx.GetPlace());
MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros);
float value = 0.0;
auto value_t = static_cast<T>(value);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
tensor_zeros_desc.get(),
GetBasePtr(&tensor_zeros));
mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
tensor_zeros_desc.get(),
GetBasePtr(&tensor_zeros),
indices_desc.get(),
GetBasePtr(indices),
mode);
mode = CNNL_SCATTERREF_ADD;
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
updates_desc.get(),
GetBasePtr(updates),
indices_desc.get(),
GetBasePtr(indices),
mode);
}
paddle::framework::TensorCopy(*x, place, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(scatter,
ops::ScatterMLUKernel<float>,
ops::ScatterMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <numeric>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/set_value_op.h"
namespace paddle {
namespace operators {
using MLUDeviceContext = platform::MLUDeviceContext;
template <typename T>
class SetValueMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<phi::DenseTensor>("Input");
auto* value_tensor = ctx.Input<phi::DenseTensor>("ValueTensor");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto starts_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
auto steps_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StepsTensorList");
auto axes = ctx.Attr<std::vector<int64_t>>("axes");
auto starts = ctx.Attr<std::vector<int64_t>>("starts");
auto ends = ctx.Attr<std::vector<int64_t>>("ends");
auto steps = ctx.Attr<std::vector<int64_t>>("steps");
auto shape = ctx.Attr<std::vector<int64_t>>("shape");
auto decrease_axes = ctx.Attr<std::vector<int64_t>>("decrease_axes");
auto none_axes = ctx.Attr<std::vector<int64_t>>("none_axes");
if (!starts_tensor_list.empty()) {
starts = GetDataFromTensorList<int64_t>(starts_tensor_list);
}
if (!ends_tensor_list.empty()) {
ends = GetDataFromTensorList<int64_t>(ends_tensor_list);
}
if (!steps_tensor_list.empty()) {
steps = GetDataFromTensorList<int64_t>(steps_tensor_list);
}
auto in_dims = in->dims();
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps);
auto slice_dims =
phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps);
auto decrease_slice_dims =
phi::funcs::GetDecreasedDims(slice_dims, decrease_axes);
auto slice_dims_for_assign = decrease_slice_dims;
if (!none_axes.empty()) {
std::vector<int64_t> slice_dims_with_none;
size_t none_axes_cur = 0, decrease_axes_cur = 0;
for (int i = 0; i < slice_dims.size(); ++i) {
while (none_axes_cur < none_axes.size() &&
none_axes[none_axes_cur] <= i) {
slice_dims_with_none.push_back(1);
none_axes_cur++;
}
if (decrease_axes_cur < decrease_axes.size() &&
decrease_axes[decrease_axes_cur] == i) {
decrease_axes_cur++;
} else {
slice_dims_with_none.push_back(slice_dims[i]);
}
}
while (none_axes_cur < none_axes.size()) {
slice_dims_with_none.push_back(1);
none_axes_cur++;
}
slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
}
int in_size = in_dims.size();
int starts_indices[in_size] = {0};
int ends_indices[in_size] = {0};
int strides_indices[in_size] = {0};
for (int i = 0; i < in_dims.size(); ++i) {
starts_indices[i] = 0;
ends_indices[i] = static_cast<int>(slice_dims[i]);
strides_indices[i] = 1;
}
for (size_t i = 0; i < axes.size(); i++) {
int axis_index = axes[i];
starts_indices[axis_index] = static_cast<int>(starts[i]);
ends_indices[axis_index] = static_cast<int>(ends[i]);
strides_indices[axis_index] = static_cast<int>(steps[i]);
}
phi::DenseTensor value_t(in->type());
if (value_tensor != nullptr) {
value_t.ShareDataWith(*value_tensor);
} else {
auto value_dims = phi::make_ddim(shape);
CheckIsDimsMatch(slice_dims_for_assign, value_dims);
value_t.mutable_data<T>(value_dims, ctx.GetPlace());
auto value_name =
GetValueName(framework::TransToProtoVarType(in->dtype()));
CopyVectorToTensor<T>(value_name.c_str(), &value_t, ctx);
value_t.Resize(value_dims);
}
phi::DenseTensor value_temp(in->type());
if (slice_dims_for_assign == value_t.dims()) {
value_temp.ShareDataWith(value_t);
} else {
value_temp.Resize(slice_dims_for_assign);
value_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc value_t_desc(value_t);
MLUCnnlTensorDesc value_temp_desc(value_temp);
MLUCnnl::BroadcastTo(ctx,
value_t_desc.get(),
GetBasePtr(&value_t),
value_temp_desc.get(),
GetBasePtr(&value_temp));
}
int64_t input_numel = phi::product(in_dims);
int64_t value_numel = phi::product(value_temp.dims());
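    // Strategy: enumerate the flat indices 0..numel-1 of the input, run
    // StridedSlice over that index tensor to collect the linear positions the
    // slice selects, then ScatterRef the broadcast value into the flattened
    // input at those positions.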
phi::DenseTensor in_temp, out_temp, val_temp, index_out;
int64_t stride_step = phi::product(in_dims);
std::vector<int64_t> index_indices(stride_step);
std::iota(index_indices.begin(), index_indices.end(), 0);
phi::DenseTensor index_temp;
in_temp.ShareDataWith(*in);
val_temp.ShareDataWith(value_temp);
paddle::framework::TensorFromVector(
index_indices, ctx.device_context(), &index_temp);
index_temp.Resize(in_dims);
auto index_dims = in_dims;
for (int i = 0; i < in_dims.size(); ++i) {
if (starts_indices[i] < 0 || ends_indices[i] < 0) {
starts_indices[i] -= in_dims[i];
ends_indices[i] -= in_dims[i];
}
if (strides_indices[i] > 0)
index_dims[i] =
static_cast<int>((ends_indices[i] - starts_indices[i] - 1) /
strides_indices[i]) +
1;
else
index_dims[i] =
static_cast<int>((ends_indices[i] - starts_indices[i] + 1) /
strides_indices[i]) +
1;
}
auto new_in_dims = phi::make_ddim({input_numel});
auto new_val_dims = phi::make_ddim({value_numel});
in_temp.Resize(new_in_dims);
val_temp.Resize(new_val_dims);
index_out.Resize(index_dims);
index_out.mutable_data<int64_t>(ctx.GetPlace());
cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
MLUCnnlTensorDesc x_desc(in_temp);
MLUCnnlTensorDesc indices_desc(index_temp);
MLUCnnlTensorDesc indices_out_desc(index_out);
MLUCnnlTensorDesc updates_desc(val_temp);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::StridedSlice(ctx,
starts_indices,
ends_indices,
strides_indices,
indices_desc.get(),
GetBasePtr(&index_temp),
indices_out_desc.get(),
GetBasePtr(&index_out));
PADDLE_ENFORCE_EQ(
static_cast<int64_t>(phi::product(index_out.dims())),
phi::product(slice_dims_for_assign),
platform::errors::InvalidArgument(
"OP(set_value) error index indices and value update not match "));
phi::DenseTensor index_final;
index_final.ShareDataWith(index_out);
int64_t indices_numel = phi::product(index_dims);
auto new_index_dims = phi::make_ddim({indices_numel});
index_final.Resize(new_index_dims);
MLUCnnlTensorDesc indices_final_desc(index_final);
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(&in_temp),
updates_desc.get(),
GetBasePtr(&val_temp),
indices_final_desc.get(),
GetBasePtr(&index_final),
mode);
in_temp.Resize(in_dims);
paddle::framework::TensorCopy(in_temp, ctx.GetPlace(), out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(set_value,
ops::SetValueMLUKernel<int>,
ops::SetValueMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using SelectedRows = phi::SelectedRows;
template <typename T>
class ShapeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_var = ctx.InputVar("Input");
framework::DDim in_dims;
if (in_var->IsType<phi::SelectedRows>()) {
in_dims = in_var->Get<phi::SelectedRows>().value().dims();
} else {
in_dims = in_var->Get<phi::DenseTensor>().dims();
}
auto* out_t = ctx.Output<phi::DenseTensor>("Out");
out_t->Resize({in_dims.size()});
out_t->mutable_data<int32_t>(ctx.GetPlace());
    // compute the shape on the CPU first
phi::DenseTensor shape_on_cpu(
framework::TransToPhiDataType(framework::proto::VarType::INT32));
shape_on_cpu.Resize({in_dims.size()});
auto cpu_data = shape_on_cpu.mutable_data<int32_t>(platform::CPUPlace());
for (int i = 0; i < in_dims.size(); ++i) {
cpu_data[i] = in_dims[i];
}
// cpu to mlu
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(shape_on_cpu, ctx.GetPlace(), dev_ctx, out_t);
dev_ctx.Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(shape,
ops::ShapeMLUKernel<bool>,
ops::ShapeMLUKernel<uint8_t>,
ops::ShapeMLUKernel<int8_t>,
ops::ShapeMLUKernel<int>,
ops::ShapeMLUKernel<int64_t>,
ops::ShapeMLUKernel<paddle::platform::float16>,
ops::ShapeMLUKernel<float>,
ops::ShapeMLUKernel<double>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
const int kIgnoreIndex = -100;
void CheckAttrs(const framework::ExecutionContext& ctx) {
  // cnnl does not support normalize or ignore_index
bool normalize = ctx.Attr<bool>("normalize");
int ignore_index = ctx.Attr<int>("ignore_index");
PADDLE_ENFORCE_EQ(normalize,
false,
platform::errors::InvalidArgument(
"attr normalize must be false, but got true"));
PADDLE_ENFORCE_EQ(ignore_index,
kIgnoreIndex,
platform::errors::InvalidArgument(
"attr ignore_index must be default %d, but got %d",
kIgnoreIndex,
ignore_index));
}
template <typename T>
class SigmoidCrossEntropyWithLogitsMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
CheckAttrs(ctx);
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*label);
MLUCnnlTensorDesc out_desc(*out);
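    // The forward pass maps onto cnnl's BCE-with-logits primitive with no
    // weight/pos_weight tensors and no reduction (CNNL_BCE_WITH_LOGITS_NONE).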
MLUCnnl::BceWithLogits(ctx,
CNNL_BCE_WITH_LOGITS_NONE,
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(label),
nullptr,
nullptr,
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SigmoidCrossEntropyWithLogitsMLUGradKernel
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
CheckAttrs(ctx);
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*label);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::BceWithLogitsBackward(ctx,
CNNL_BCE_WITH_LOGITS_NONE,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(label),
nullptr,
nullptr,
nullptr,
nullptr,
x_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsMLUKernel<float>,
ops::SigmoidCrossEntropyWithLogitsMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(
sigmoid_cross_entropy_with_logits_grad,
ops::SigmoidCrossEntropyWithLogitsMLUGradKernel<float>,
ops::SigmoidCrossEntropyWithLogitsMLUGradKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class SizeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("Input");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<int64_t>(ctx.GetPlace());
int64_t size = x->numel();
FillMLUTensorWithHostValue<int64_t>(ctx, size, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(size,
ops::SizeMLUKernel<int>,
ops::SizeMLUKernel<int64_t>,
ops::SizeMLUKernel<paddle::platform::float16>,
ops::SizeMLUKernel<float>,
ops::SizeMLUKernel<double>,
ops::SizeMLUKernel<bool>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/slice_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class SliceMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
auto ends = ctx.Attr<std::vector<int>>("ends");
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
// Get the accurate attribute value of starts and ends
auto starts_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
if (ctx.HasInput("StartsTensor")) {
starts = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("StartsTensor"));
} else if (starts_tensor_list.size() > 0) {
starts = GetDataFromTensorList<int>(starts_tensor_list);
}
auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
if (ctx.HasInput("EndsTensor")) {
ends = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("EndsTensor"));
} else if (ends_tensor_list.size() > 0) {
ends = GetDataFromTensorList<int>(ends_tensor_list);
}
PADDLE_ENFORCE_EQ(
starts.size(),
axes.size(),
platform::errors::InvalidArgument(
"The size of starts must be equal to the size of axes."));
PADDLE_ENFORCE_EQ(
ends.size(),
axes.size(),
platform::errors::InvalidArgument(
"The size of ends must be equal to the size of axes."));
const auto& in_dims = input->dims();
auto slice_dims = out->dims();
bool reset_slice_dims = false;
if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") ||
starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) {
// Infer output dims
for (size_t i = 0; i < axes.size(); ++i) {
        // When start == -1 and end == 0 (i.e. end == start + 1) on an axis
        // that is to be decreased, reset end to the full dimension size.
if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
if (ret != decrease_axis.end()) {
ends[i] = in_dims[axes[i]];
}
}
}
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(
in_dims, axes, starts, ends, nullptr, nullptr);
reset_slice_dims = true;
auto out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis);
out->Resize(out_dims);
}
if (slice_dims.size() != in_dims.size() && !reset_slice_dims) {
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(
in_dims, axes, starts, ends, nullptr, nullptr);
}
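    // cnnlStridedSlice takes per-dimension parameters, so when `axes` does
    // not cover every input dimension, expand starts/ends to the full rank
    // (untouched axes keep their whole extent and a stride of 1).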
int in_dim_size = input->dims().size();
if (static_cast<int>(axes.size()) != in_dim_size) {
std::vector<int> tmp_starts(in_dim_size, 0);
const auto& in_dims_vec = phi::vectorize(input->dims());
std::vector<int> tmp_ends(in_dims_vec.begin(), in_dims_vec.end());
for (size_t i = 0; i < axes.size(); ++i) {
tmp_starts[axes[i]] = starts[i];
tmp_ends[axes[i]] = ends[i];
}
starts.swap(tmp_starts);
ends.swap(tmp_ends);
}
std::vector<int> strides(in_dim_size, 1);
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc out_desc(slice_dims.size(),
phi::vectorize(slice_dims).data(),
ToCnnlDataType<T>());
MLUCnnl::StridedSlice(ctx,
starts.data(),
ends.data(),
strides.data(),
input_desc.get(),
GetBasePtr(input),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SliceGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dinput =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
auto ends = ctx.Attr<std::vector<int>>("ends");
// Get the accurate attribute value of starts and ends
auto starts_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
if (ctx.HasInput("StartsTensor")) {
starts = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("StartsTensor"));
} else if (starts_tensor_list.size() > 0) {
starts = GetDataFromTensorList<int>(starts_tensor_list);
}
auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
if (ctx.HasInput("EndsTensor")) {
ends = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("EndsTensor"));
} else if (ends_tensor_list.size() > 0) {
ends = GetDataFromTensorList<int>(ends_tensor_list);
}
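    // Recover the (non-decreased) slice shape of dout, expand starts/ends to
    // the full input rank, and let MLUCnnl::StridedSliceGrad map dout back
    // into the corresponding region of dinput.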
const auto& in_dims = input->dims();
auto slice_dims = dout->dims();
if (slice_dims.size() != in_dims.size()) {
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(
in_dims, axes, starts, ends, nullptr, nullptr);
}
int in_dim_size = input->dims().size();
if (static_cast<int>(axes.size()) != in_dim_size) {
std::vector<int> tmp_starts(in_dim_size, 0);
const auto& in_dims_vec = phi::vectorize(input->dims());
std::vector<int> tmp_ends(in_dims_vec.begin(), in_dims_vec.end());
for (size_t i = 0; i < axes.size(); ++i) {
tmp_starts[axes[i]] = starts[i];
tmp_ends[axes[i]] = ends[i];
}
starts.swap(tmp_starts);
ends.swap(tmp_ends);
}
std::vector<int> strides(in_dim_size, 1);
dinput->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(slice_dims.size(),
phi::vectorize(slice_dims).data(),
ToCnnlDataType<T>());
MLUCnnlTensorDesc dinput_desc(*dinput);
MLUCnnl::StridedSliceGrad(ctx,
starts.data(),
ends.data(),
strides.data(),
dout_desc.get(),
GetBasePtr(dout),
dinput_desc.get(),
GetBasePtr(dinput));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(slice,
ops::SliceMLUKernel<float>,
ops::SliceMLUKernel<int>,
ops::SliceMLUKernel<bool>,
ops::SliceMLUKernel<int64_t>,
ops::SliceMLUKernel<double>,
ops::SliceMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(slice_grad,
ops::SliceGradMLUKernel<float>,
ops::SliceGradMLUKernel<int>,
ops::SliceGradMLUKernel<bool>,
ops::SliceGradMLUKernel<int64_t>,
ops::SliceGradMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
namespace operators {
template <cnnlSoftmaxAlgorithm_t softmax_algo, typename T>
class SoftmaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const int rank = in->dims().size();
const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
    // cnnl softmax only supports 3 dims, so regard every shape as [d1, d2, d3]
const int cnnl_softmax_dims = 3;
const int d1 = phi::funcs::SizeToAxis(axis, in->dims());
const int d2 = in->dims()[axis];
const int d3 = phi::funcs::SizeOutAxis(axis, in->dims());
    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, so use it
    // whenever possible.
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
std::vector<int> regard_in_shape{d1, 1, d2};
if (d3 != 1) {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
regard_in_shape = {d1, d2, d3};
}
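    // When d3 == 1 the softmax axis is the last dimension of the 3-D view
    // {d1, 1, d2}, which matches CNNL_SOFTMAX_MODE_LOW_DIMENSION; otherwise
    // the axis is the middle dimension of {d1, d2, d3}, so the medium mode
    // is used.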
static const cnnlSoftmaxAlgorithm_t algo = softmax_algo;
MLUCnnlTensorDesc in_desc(
cnnl_softmax_dims, regard_in_shape.data(), ToCnnlDataType<T>());
MLUCnnl::SoftmaxForward(ctx,
algo,
mode,
NULL,
in_desc.get(),
GetBasePtr(in),
NULL,
in_desc.get(),
GetBasePtr(out));
}
};
template <cnnlSoftmaxAlgorithm_t softmax_algo, typename T>
class SoftmaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dX->mutable_data<T>(ctx.GetPlace());
const int rank = out->dims().size();
const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
    // cnnl softmax only supports 3 dims, so regard every shape as [d1, d2, d3]
const int cnnl_softmax_dims = 3;
const int d1 = phi::funcs::SizeToAxis(axis, out->dims());
const int d2 = out->dims()[axis];
const int d3 = phi::funcs::SizeOutAxis(axis, out->dims());
    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, so use it
    // whenever possible.
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
std::vector<int> regard_out_shape{d1, 1, d2};
if (d3 != 1) {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
regard_out_shape = {d1, d2, d3};
}
static const cnnlSoftmaxAlgorithm_t algo = softmax_algo;
MLUCnnlTensorDesc out_desc(
cnnl_softmax_dims, regard_out_shape.data(), ToCnnlDataType<T>());
MLUCnnl::SoftmaxBackward(ctx,
algo,
mode,
out_desc.get(),
GetBasePtr(out),
out_desc.get(),
GetBasePtr(dOut),
out_desc.get(),
GetBasePtr(dX));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
softmax,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_ACCURATE, float>,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_ACCURATE, plat::float16>);
REGISTER_OP_MLU_KERNEL(softmax_grad,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_ACCURATE, float>,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_ACCURATE,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(log_softmax,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_LOG, float>,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_LOG, plat::float16>);
REGISTER_OP_MLU_KERNEL(
log_softmax_grad,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_LOG, float>,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_LOG, paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* logits = ctx.Input<phi::DenseTensor>("Logits");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* softmax = ctx.Output<phi::DenseTensor>("Softmax");
auto* loss = ctx.Output<phi::DenseTensor>("Loss");
auto* backprop = ctx.Output<phi::DenseTensor>("Backprop");
auto soft_label = ctx.Attr<bool>("soft_label");
PADDLE_ENFORCE_EQ(ctx.Attr<bool>("use_softmax"),
true,
platform::errors::InvalidArgument(
"use_softmax=False is not supported in "
"the mlu kernel of softmax_with_cross_entropy."));
const int rank = logits->dims().size();
const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
loss->mutable_data<T>(ctx.GetPlace());
backprop->mutable_data<T>(ctx.GetPlace());
softmax->mutable_data<T>(ctx.GetPlace());
    // cnnl softmax only supports 3 dims, so regard every shape as [d1, d2, d3]
const int cnnl_softmax_dims = 3;
const int d1 = phi::funcs::SizeToAxis(axis, logits->dims());
const int d2_logits = logits->dims()[axis];
const int d2_labels = labels->dims()[axis];
const int d3 = phi::funcs::SizeOutAxis(axis, logits->dims());
    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, so use it
    // whenever possible.
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
std::vector<int> regard_logits_shape{d1, 1, d2_logits};
std::vector<int> regard_labels_shape{d1, 1, d2_labels};
std::vector<int> regard_loss_shape{d1, 1, 1};
if (d3 != 1) {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
regard_logits_shape = {d1, d2_logits, d3};
regard_labels_shape = {d1, d2_labels, d3};
regard_loss_shape = {d1, 1, d3};
}
MLUCnnlTensorDesc logits_desc(
cnnl_softmax_dims, regard_logits_shape.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc labels_desc(
cnnl_softmax_dims, regard_labels_shape.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc loss_desc(
cnnl_softmax_dims, regard_loss_shape.data(), ToCnnlDataType<T>());
const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE;
MLUCnnl::SoftmaxForward(ctx,
algo,
mode,
NULL,
logits_desc.get(),
GetBasePtr(logits),
NULL,
logits_desc.get(),
GetBasePtr(softmax));
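    // The softmax result is written to the "Softmax" output; the cross
    // entropy below also fills "Backprop", which the grad kernel multiplies
    // with the loss gradient.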
if (soft_label) {
const cnnlComputationPreference_t prefer =
CNNL_COMPUTATION_HIGH_PRECISION;
MLUCnnl::SoftmaxCrossEntropyWithLogits(ctx,
mode,
prefer,
logits_desc.get(),
GetBasePtr(logits),
labels_desc.get(),
GetBasePtr(labels),
loss_desc.get(),
GetBasePtr(loss),
logits_desc.get(),
GetBasePtr(backprop));
} else {
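      // Hard-label path: cast the int64 labels to int32 and regard them as a
      // [d1, 1] tensor before calling SparseSoftmaxXentWithLogits.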
PADDLE_ENFORCE_EQ(d3,
1,
platform::errors::InvalidArgument(
"If soft_label=False, axis must be -1 or"
" can be regard as last dimention in mlu kernel."));
phi::DenseTensor labels_int32(framework::TransToPhiDataType(VT::INT32));
labels_int32.Resize(labels->dims());
labels_int32.mutable_data<int32_t>(ctx.GetPlace());
MLUCnnlTensorDesc labels_int64_desc(*labels);
MLUCnnlTensorDesc labels_int32_desc(labels_int32);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
labels_int64_desc.get(),
GetBasePtr(labels),
labels_int32_desc.get(),
GetBasePtr(&labels_int32));
const int regard_sparse_shape[cnnl_softmax_dims - 1] = {d1, 1};
MLUCnnlTensorDesc sparse_labels_desc(cnnl_softmax_dims - 1,
regard_sparse_shape,
ToCnnlDataType<int32_t>());
MLUCnnlTensorDesc sparse_loss_desc(
cnnl_softmax_dims - 1, regard_sparse_shape, ToCnnlDataType<T>());
MLUCnnl::SparseSoftmaxXentWithLogits(ctx,
mode,
logits_desc.get(),
GetBasePtr(logits),
sparse_labels_desc.get(),
GetBasePtr(&labels_int32),
sparse_loss_desc.get(),
GetBasePtr(loss),
logits_desc.get(),
GetBasePtr(backprop));
}
}
};
template <typename T>
class SoftmaxWithCrossEntropyGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* backprop = ctx.Input<phi::DenseTensor>("Backprop");
auto* loss_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Loss"));
auto* logits_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Logits"));
PADDLE_ENFORCE_NOT_NULL(backprop,
platform::errors::PreconditionNotMet(
"backprop should not be null in MLU kernel of "
"softmax_with_cross_entropy_grad."));
logits_grad->mutable_data<T>(ctx.GetPlace());
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnlTensorDesc backprop_desc(*backprop);
MLUCnnlTensorDesc loss_grad_desc(*loss_grad);
MLUCnnlTensorDesc logits_grad_desc(*logits_grad);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
backprop_desc.get(),
GetBasePtr(backprop),
loss_grad_desc.get(),
GetBasePtr(loss_grad),
logits_grad_desc.get(),
GetBasePtr(logits_grad),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyMLUKernel<float>,
ops::SoftmaxWithCrossEntropyMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradMLUKernel<float>,
ops::SoftmaxWithCrossEntropyGradMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/split_op.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class SplitMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// init parameter
auto* in = ctx.Input<phi::DenseTensor>("X");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
int num = ctx.Attr<int>("num");
std::vector<int> sections = ctx.Attr<std::vector<int>>("sections");
int axis = ctx.Attr<int>("axis");
auto in_dims = in->dims();
auto out_size = outs.size();
auto num_tensor = num == 0 ? out_size : num;
bool need_resize_outs_dims = false;
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor(axis_tensor)[0];
need_resize_outs_dims = true;
}
auto sections_tensor_list =
ctx.MultiInput<phi::DenseTensor>("SectionsTensorList");
if (sections_tensor_list.size() > 0) {
sections = GetDataFromTensorList(sections_tensor_list);
need_resize_outs_dims = true;
}
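    // If axis or sections came in as tensors, the compile-time output shapes
    // are stale, so recompute and resize every output before splitting.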
if (need_resize_outs_dims) {
std::vector<framework::DDim> outs_dims =
UpdateOutsDims(true, true, in_dims, num, sections, axis, out_size);
for (size_t j = 0; j < outs.size(); ++j) {
outs[j]->Resize(outs_dims[j]);
}
}
// init out tensors
std::vector<void*> vct_tensor;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < outs.size(); i++) {
outs[i]->mutable_data<T>(ctx.GetPlace());
output_descs.emplace_back(MLUCnnlTensorDesc(
*outs[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(outs[i]->dtype())));
desc_vector.push_back(output_descs.back().get());
vct_tensor.push_back(GetBasePtr(outs[i]));
}
// init in tensors
MLUCnnlTensorDesc input_desc(
*in, CNNL_LAYOUT_ARRAY, ToCnnlDataType(in->dtype()));
    // Split the input along axis into num_tensor output tensors.
MLUCnnl::Split(ctx,
num_tensor,
axis,
input_desc.get(),
GetBasePtr(in),
desc_vector.data(),
vct_tensor.data());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(split,
ops::SplitMLUKernel<float>,
ops::SplitMLUKernel<int64_t>,
ops::SplitMLUKernel<int>,
ops::SplitMLUKernel<bool>,
ops::SplitMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class SquaredL2NormMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &dev_ctx = context.template device_context<MLUDeviceContext>();
auto *x = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
auto place = context.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc input_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
// L2Loss
MLUCnnl::L2Loss(context, input_desc.get(), GetBasePtr(x), GetBasePtr(out));
    // Scale the L2Loss result by 2.0 so Out equals sum(x^2).
phi::DenseTensor scale_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
phi::DenseTensor bias_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(2.0f), &scale_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(0.0f), &bias_tensor);
MLUCnnl::Scale(context,
0,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SquaredL2NormGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &dev_ctx = context.template device_context<MLUDeviceContext>();
auto *x = context.Input<phi::DenseTensor>("X");
auto *x_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *out_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(
out_grad->numel(),
1,
platform::errors::InvalidArgument(
"Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar."));
auto place = context.GetPlace();
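    // d(sum(x^2))/dx = 2 * x * dOut, with the scalar dOut broadcast to the
    // shape of X.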
// broadcast out_grad
phi::DenseTensor broadcasted_out_grad;
broadcasted_out_grad.mutable_data<T>(x_grad->dims(), place);
MLUCnnlTensorDesc broadcasted_out_grad_desc(broadcasted_out_grad);
MLUCnnlTensorDesc out_grad_desc(*out_grad);
MLUCnnl::BroadcastTo(context,
out_grad_desc.get(),
GetBasePtr(out_grad),
broadcasted_out_grad_desc.get(),
GetBasePtr(&broadcasted_out_grad));
    // tmp_x_grad = x * broadcasted_out_grad
phi::DenseTensor tmp_x_grad;
tmp_x_grad.mutable_data<T>(x_grad->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc tmp_x_grad_desc(tmp_x_grad);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType(x->dtype()), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
broadcasted_out_grad_desc.get(),
GetBasePtr(&broadcasted_out_grad),
tmp_x_grad_desc.get(),
GetBasePtr(&tmp_x_grad),
ToCnnlDataType(x->dtype()));
    // x_grad = 2.0 * tmp_x_grad
phi::DenseTensor scale_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
phi::DenseTensor bias_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(2.0f), &scale_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(0.0f), &bias_tensor);
x_grad->mutable_data<T>(place);
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::Scale(context,
0,
tmp_x_grad_desc.get(),
GetBasePtr(&tmp_x_grad),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
x_grad_desc.get(),
GetBasePtr(x_grad));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(squared_l2_norm,
ops::SquaredL2NormMLUKernel<float>,
ops::SquaredL2NormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(squared_l2_norm_grad,
ops::SquaredL2NormGradMLUKernel<float>,
ops::SquaredL2NormGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <memory>
#include <string>
#include "paddle/fluid/operators/squeeze_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
squeeze,
ops::SqueezeKernel<plat::MLUDeviceContext, float>,
ops::SqueezeKernel<plat::MLUDeviceContext, double>,
ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
ops::SqueezeKernel<plat::MLUDeviceContext, int>,
ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
squeeze_grad,
ops::SqueezeGradKernel<plat::MLUDeviceContext, float>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, double>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, plat::float16>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, bool>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, int>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, uint8_t>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, int8_t>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
squeeze2,
ops::SqueezeKernel<plat::MLUDeviceContext, float>,
ops::SqueezeKernel<plat::MLUDeviceContext, double>,
ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
ops::SqueezeKernel<plat::MLUDeviceContext, int>,
ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, float>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, double>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, plat::float16>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, bool>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, int>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, uint8_t>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, int8_t>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, int64_t>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class StackMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto x = ctx.MultiInput<phi::DenseTensor>("X");
auto* y = ctx.Output<phi::DenseTensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int num = static_cast<int>(x.size());
PADDLE_ENFORCE_GT(num,
0,
platform::errors::InvalidArgument(
"number of input phi::DenseTensor <= 0"));
std::vector<MLUCnnlTensorDesc> x_descs;
std::vector<cnnlTensorDescriptor_t> x_raw_descs;
std::vector<const void*> x_ptrs;
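    // Stack is implemented as a concat: insert a size-1 dimension at `axis`
    // into every input's shape and concatenate along that axis.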
for (int i = 0; i < num; i++) {
if (x[i]->dims().size() != 0) {
std::vector<int64_t> in_dims = phi::vectorize(x[i]->dims());
in_dims.insert(in_dims.begin() + axis, 1);
x_descs.emplace_back(MLUCnnlTensorDesc(
in_dims.size(), in_dims.data(), ToCnnlDataType<T>()));
} else {
int input_dims = 1;
x_descs.emplace_back(
MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType<T>()));
}
x_raw_descs.push_back(x_descs.back().get());
x_ptrs.push_back(GetBasePtr(x[i]));
}
y->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc y_desc(*y);
MLUCnnl::Concat(ctx,
num,
axis,
x_raw_descs.data(),
x_ptrs.data(),
y_desc.get(),
GetBasePtr(y));
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(
stack,
paddle::operators::StackMLUKernel<int64_t>,
paddle::operators::StackMLUKernel<int>,
paddle::operators::StackMLUKernel<float>,
paddle::operators::StackMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/strided_slice.h"
namespace paddle {
namespace operators {
using Variable = framework::Variable;
using LoDTensorArray = framework::LoDTensorArray;
using DDim = framework::DDim;
static void ProcessStridedSliceParams(
const std::vector<int>& axes,
const DDim& input_dims,
const std::vector<int64_t>& starts,
const std::vector<int64_t>& ends,
const std::vector<int64_t>& strides,
const std::vector<int>& infer_flags,
const std::vector<int>& decrease_axis,
std::vector<int>* starts_indices_vector,
std::vector<int>* ends_indices_vector,
std::vector<int>* strides_indices_vector) {
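  // Clamp each start/end into the valid range of its axis; for negative
  // strides keep them as negative (end-relative) indices, and force a
  // length-1 slice on axes that will be decreased.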
for (size_t axis = 0; axis < axes.size(); axis++) {
int64_t start = starts[axis];
int64_t end = ends[axis];
int64_t stride = strides[axis];
int axis_index = axes[axis];
int64_t dim_size = input_dims[axis_index];
bool decrease_axis_affect = false;
if (start == -1 && end == 0 && infer_flags[axis] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axis_index);
if (ret != decrease_axis.end()) {
decrease_axis_affect = true;
}
}
if (stride < 0) {
if (start < 0) {
start = std::max(start, -dim_size);
} else {
start = std::min(start, dim_size - 1) - dim_size;
}
if (end < 0) {
end = std::max(end, -dim_size - 1);
} else {
end = end - dim_size;
}
} else {
if (start < 0) {
start = std::max(start, -dim_size) + dim_size;
} else {
start = std::min(start, dim_size - 1);
}
if (end < 0) {
end = end + dim_size;
} else {
end = std::min(end, dim_size);
}
}
if (decrease_axis_affect) {
if (stride < 0) {
end = start - 1;
} else {
end = start + 1;
}
}
(*starts_indices_vector)[axis_index] = static_cast<int>(start);
(*ends_indices_vector)[axis_index] = static_cast<int>(end);
(*strides_indices_vector)[axis_index] = static_cast<int>(stride);
}
}
template <typename T>
class StridedSliceMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Variable* input_var = ctx.InputVar("Input");
bool is_tensor_array = input_var->IsType<LoDTensorArray>();
PADDLE_ENFORCE_EQ(is_tensor_array,
false,
platform::errors::InvalidArgument(
"phi::DenseTensor array as input is not supported."));
int rank = ctx.Input<phi::DenseTensor>("Input")->dims().size();
switch (rank) {
case 1:
StridedSliceCompute<1>(ctx);
break;
case 2:
StridedSliceCompute<2>(ctx);
break;
case 3:
StridedSliceCompute<3>(ctx);
break;
case 4:
StridedSliceCompute<4>(ctx);
break;
case 5:
StridedSliceCompute<5>(ctx);
break;
case 6:
StridedSliceCompute<6>(ctx);
break;
case 7:
StridedSliceCompute<7>(ctx);
break;
case 8:
StridedSliceCompute<8>(ctx);
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"The rank of input is supported up to 8."));
break;
}
}
private:
template <size_t D>
void StridedSliceCompute(const framework::ExecutionContext& ctx) const {
auto place = ctx.GetPlace();
auto in = ctx.Input<phi::DenseTensor>("Input");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto in_dims = in->dims();
// list<int>
auto starts_int = ctx.Attr<std::vector<int>>("starts");
auto ends_int = ctx.Attr<std::vector<int>>("ends");
auto strides_int = ctx.Attr<std::vector<int>>("strides");
std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
std::vector<int64_t> strides(strides_int.begin(), strides_int.end());
auto axes = ctx.Attr<std::vector<int>>("axes");
auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
// vector<phi::DenseTensor<int32>>
auto list_new_starts_tensor =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
auto list_new_ends_tensor =
ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
auto list_new_strides_tensor =
ctx.MultiInput<phi::DenseTensor>("StridesTensorList");
// phi::DenseTensor<int32>
if (list_new_starts_tensor.size() > 0) {
starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
} else if (ctx.HasInput("StartsTensor")) {
auto* starts_tensor = ctx.Input<phi::DenseTensor>("StartsTensor");
starts = phi::GetVectorFromTensor<int64_t>(starts_tensor);
}
if (list_new_ends_tensor.size() > 0) {
ends = GetDataFromTensorList<int64_t>(list_new_ends_tensor);
} else if (ctx.HasInput("EndsTensor")) {
auto* ends_tensor = ctx.Input<phi::DenseTensor>("EndsTensor");
ends = phi::GetVectorFromTensor<int64_t>(ends_tensor);
}
if (list_new_strides_tensor.size() > 0) {
strides = GetDataFromTensorList<int64_t>(list_new_strides_tensor);
} else if (ctx.HasInput("StridesTensor")) {
auto* strides_tensor = ctx.Input<phi::DenseTensor>("StridesTensor");
strides = phi::GetVectorFromTensor<int64_t>(strides_tensor);
}
// out dims calculation
std::vector<int64_t> out_dims_vector(in_dims.size(), -1);
phi::funcs::StridedSliceOutDims(starts,
ends,
strides,
axes,
infer_flags,
in_dims,
decrease_axis,
out_dims_vector.data(),
axes.size(),
false);
framework::DDim out_dims(phi::make_ddim(out_dims_vector));
    // construct the starts_indices, ends_indices and strides_indices vectors
    // for calling the StridedSlice op
std::vector<int> starts_indices_vector(D, 0);
std::vector<int> ends_indices_vector(out_dims_vector.begin(),
out_dims_vector.end());
std::vector<int> strides_indices_vector(D, 1);
ProcessStridedSliceParams(axes,
in_dims,
starts,
ends,
strides,
infer_flags,
decrease_axis,
&starts_indices_vector,
&ends_indices_vector,
&strides_indices_vector);
auto out_dims_origin = out_dims;
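    // Squeeze out the decreased axes (each must have extent 1) to obtain the
    // final output shape.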
if (decrease_axis.size() > 0) {
std::vector<int64_t> new_out_shape;
for (size_t i = 0; i < decrease_axis.size(); ++i) {
PADDLE_ENFORCE_EQ(
out_dims[decrease_axis[i]],
1,
platform::errors::InvalidArgument(
"the size of decrease dimension should be 1, but received %d.",
out_dims[decrease_axis[i]]));
out_dims_origin[decrease_axis[i]] = 0;
}
for (int i = 0; i < out_dims_origin.size(); ++i) {
if (out_dims_origin[i] != 0) {
new_out_shape.push_back(out_dims_origin[i]);
}
}
if (new_out_shape.size() == 0) {
new_out_shape.push_back(1);
}
out_dims_origin = phi::make_ddim(new_out_shape);
}
out->Resize(out_dims_origin);
out->mutable_data<T>(place);
MLUCnnlTensorDesc in_desc(*in);
MLUCnnlTensorDesc out_desc(
out_dims_vector.size(), out_dims_vector.data(), ToCnnlDataType<T>());
MLUCnnl::StridedSlice(ctx,
starts_indices_vector.data(),
ends_indices_vector.data(),
strides_indices_vector.data(),
in_desc.get(),
GetBasePtr(in),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class StridedSliceGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Variable* input_var = ctx.InputVar("Input");
bool is_tensor_array = input_var->IsType<LoDTensorArray>();
PADDLE_ENFORCE_EQ(is_tensor_array,
false,
platform::errors::InvalidArgument(
"phi::DenseTensor array as input is not supported."));
int rank = ctx.Input<phi::DenseTensor>("Input")->dims().size();
switch (rank) {
case 1:
StridedSliceGradCompute<1>(ctx);
break;
case 2:
StridedSliceGradCompute<2>(ctx);
break;
case 3:
StridedSliceGradCompute<3>(ctx);
break;
case 4:
StridedSliceGradCompute<4>(ctx);
break;
case 5:
StridedSliceGradCompute<5>(ctx);
break;
case 6:
StridedSliceGradCompute<6>(ctx);
break;
case 7:
StridedSliceGradCompute<7>(ctx);
break;
case 8:
StridedSliceGradCompute<8>(ctx);
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"The rank of input is supported up to 8."));
break;
}
}
private:
template <size_t D>
void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const {
auto place = ctx.GetPlace();
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto input_dims = input->dims();
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
dx->mutable_data<T>(input_dims, place);
auto starts_int = ctx.Attr<std::vector<int>>("starts");
auto ends_int = ctx.Attr<std::vector<int>>("ends");
auto strides_int = ctx.Attr<std::vector<int>>("strides");
std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
std::vector<int64_t> strides(strides_int.begin(), strides_int.end());
auto axes = ctx.Attr<std::vector<int>>("axes");
auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
auto list_new_ends_tensor =
ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
auto list_new_starts_tensor =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
auto list_new_strides_tensor =
ctx.MultiInput<phi::DenseTensor>("StridesTensorList");
if (list_new_starts_tensor.size() > 0) {
starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
} else if (ctx.HasInput("StartsTensor")) {
auto* starts_tensor = ctx.Input<phi::DenseTensor>("StartsTensor");
starts = phi::GetVectorFromTensor<int64_t>(starts_tensor);
}
if (list_new_ends_tensor.size() > 0) {
ends = GetDataFromTensorList<int64_t>(list_new_ends_tensor);
} else if (ctx.HasInput("EndsTensor")) {
auto* ends_tensor = ctx.Input<phi::DenseTensor>("EndsTensor");
ends = phi::GetVectorFromTensor<int64_t>(ends_tensor);
}
if (list_new_strides_tensor.size() > 0) {
strides = GetDataFromTensorList<int64_t>(list_new_strides_tensor);
} else if (ctx.HasInput("StridesTensor")) {
auto* strides_tensor = ctx.Input<phi::DenseTensor>("StridesTensor");
strides = phi::GetVectorFromTensor<int64_t>(strides_tensor);
}
std::vector<int64_t> out_dims_vector(input_dims.size(), -1);
phi::funcs::StridedSliceOutDims(starts,
ends,
strides,
axes,
infer_flags,
input_dims,
decrease_axis,
out_dims_vector.data(),
axes.size(),
false);
std::vector<int> starts_indices_vector(D, 0);
std::vector<int> ends_indices_vector(out_dims_vector.begin(),
out_dims_vector.end());
std::vector<int> strides_indices_vector(D, 1);
ProcessStridedSliceParams(axes,
input_dims,
starts,
ends,
strides,
infer_flags,
decrease_axis,
&starts_indices_vector,
&ends_indices_vector,
&strides_indices_vector);
MLUCnnlTensorDesc dout_desc(
out_dims_vector.size(), out_dims_vector.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dx_desc(*input);
MLUCnnl::StridedSliceGrad(ctx,
starts_indices_vector.data(),
ends_indices_vector.data(),
strides_indices_vector.data(),
dout_desc.get(),
GetBasePtr(dout),
dx_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(strided_slice,
ops::StridedSliceMLUKernel<plat::float16>,
ops::StridedSliceMLUKernel<bool>,
ops::StridedSliceMLUKernel<int>,
ops::StridedSliceMLUKernel<int64_t>,
ops::StridedSliceMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(strided_slice_grad,
ops::StridedSliceGradMLUKernel<plat::float16>,
ops::StridedSliceGradMLUKernel<float>,
ops::StridedSliceGradMLUKernel<bool>,
ops::StridedSliceGradMLUKernel<int>,
ops::StridedSliceGradMLUKernel<int64_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using SelectedRows = phi::SelectedRows;
template <typename DeviceContext, typename T>
class SumMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto out_var = ctx.OutputVar("Out");
if (out_var->IsType<phi::DenseTensor>()) {
// init
auto *out = out_var->GetMutable<phi::DenseTensor>();
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
out->mutable_data<T>(ctx.GetPlace());
auto place = ctx.GetPlace();
int ins_size = static_cast<int>(ins.size());
if (ins_size == 1) {
framework::TensorCopy(*ins[0], place, out);
return;
}
      // Sum all inputs into out with MLUCnnl::AddN.
std::vector<const void *> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (int i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
*ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(GetBasePtr(ins[i]));
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
uint32_t ins_size_t = static_cast<uint32_t>(ins_size);
MLUCnnl::AddN(ctx,
ins_size_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(out));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be phi::DenseTensor or But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
sum,
ops::SumMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::SumMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/collective_helper.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
#define NO_USE_CNCL 0
#define GET_LAYOUT_OFFSET 2
static std::vector<cnnlTensorLayout_t> supported_input_layout = {
CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC};
template <typename T>
class SyncBatchNormMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
const std::string layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout layout = phi::StringToDataLayout(layout_str);
PADDLE_ENFORCE_EQ(use_global_stats,
false,
platform::errors::InvalidArgument(
"sync_batch_norm doesn't support "
"to set use_global_stats True. Please use batch_norm "
"in this case."));
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
const auto *mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *variance = ctx.Input<phi::DenseTensor>("Variance");
auto *mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
auto *variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
auto *saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
auto *saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
auto *y = ctx.Output<phi::DenseTensor>("Y");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
platform::errors::InvalidArgument(
"The Input dim size should be larger than 1."));
PADDLE_ENFORCE_LE(x_dims.size(),
5,
platform::errors::InvalidArgument(
"The Input dim size should be less than 6."));
int N, C, H, W, D;
phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<MPDType>(ctx.GetPlace());
variance_out->mutable_data<MPDType>(ctx.GetPlace());
saved_mean->mutable_data<MPDType>(ctx.GetPlace());
saved_variance->mutable_data<MPDType>(ctx.GetPlace());
phi::DenseTensor trans_x;
phi::DenseTensor trans_y;
std::vector<int> forward_perm;
std::vector<int> backward_perm;
std::vector<int> trans_shape;
const bool need_transpose =
((layout == DataLayout::kNCHW && x_dims.size() != 2) ||
x_dims.size() == 5);
if (need_transpose) {
SetMLUTransposePerm(
x_dims, layout, &forward_perm, &backward_perm, &trans_shape);
trans_x.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
trans_y.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
MLUCnnlTensorDesc desc_x(*x);
MLUCnnlTensorDesc desc_trans_x(
trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype()));
MLUCnnl::Transpose(ctx,
forward_perm,
x_dims.size(),
desc_x.get(),
GetBasePtr(x),
desc_trans_x.get(),
GetBasePtr(&trans_x));
} else {
trans_x = *x;
trans_y = *y;
}
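    // Rank 2/3/4/5 inputs map to the NC/NLC/NHWC/NDHWC layouts respectively
    // (GET_LAYOUT_OFFSET == 2).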
MLUCnnlTensorDesc desc_trans(
trans_x,
supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET],
ToCnnlDataType<T>());
bool test_mode = is_test && (!trainable_stats);
if (test_mode) { // inference
MLUCnnlTensorDesc desc_weight_bias_mean_var(*bias);
MLUCnnl::FusedBatchNorm(ctx,
false /*is_training*/,
desc_trans.get(),
GetBasePtr(&trans_x),
desc_weight_bias_mean_var.get(),
GetBasePtr(scale),
GetBasePtr(bias),
GetBasePtr(mean),
GetBasePtr(variance),
epsilon,
momentum,
desc_trans.get(),
GetBasePtr(&trans_y),
nullptr,
nullptr,
nullptr,
nullptr);
} else { // training
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
phi::DenseTensor mom_cpu;
paddle::framework::TensorCopySync(
*mom_tensor, platform::CPUPlace(), &mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
phi::DenseTensor local_mean, local_var;
local_mean.mutable_data<MPDType>(mean->dims(), ctx.GetPlace());
local_var.mutable_data<MPDType>(variance->dims(), ctx.GetPlace());
MLUCnnlTensorDesc desc_mean_var(*mean_out);
      // calc local_mean and local_var on the current card
MLUCnnl::SyncBatchNormStats(ctx,
desc_trans.get(),
GetBasePtr(&trans_x),
epsilon,
desc_mean_var.get(),
GetBasePtr(&local_mean),
desc_mean_var.get(),
GetBasePtr(&local_var));
phi::DenseTensor input_count;
input_count.mutable_data<MPDType>(phi::make_ddim({1}), ctx.GetPlace());
FillMLUTensorWithHostValue<MPDType>(
ctx, static_cast<MPDType>(x->numel() / C), &input_count);
phi::DenseTensor count_all;
phi::DenseTensor mean_all(mean->dtype());
phi::DenseTensor invstd_all(variance->dtype());
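      // With CNCL available, all-gather each card's element count and local
      // statistics, then reduce them into the global moving/saved statistics.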
#ifdef PADDLE_WITH_CNCL
auto &dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto *comm = dev_ctx.cncl_comm();
if (comm) {
auto cncl_comm = paddle::platform::CNCLCommContext::Instance().Get(
0, ctx.GetPlace());
auto *comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
int count;
PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCommCount(&count, comm));
count_all.mutable_data<MPDType>(phi::make_ddim({count}),
ctx.GetPlace());
mean_all.mutable_data<MPDType>(phi::make_ddim({count, mean->numel()}),
ctx.GetPlace());
invstd_all.mutable_data<MPDType>(
phi::make_ddim({count, variance->numel()}), ctx.GetPlace());
        // Before the comm stream executes, the compute stream must be synced.
dev_ctx.Wait();
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(count_all.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&input_count),
GetBasePtr(&count_all),
1,
dtype,
comm,
comm_stream));
auto cncl_dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(mean_all.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_mean),
GetBasePtr(&mean_all),
local_mean.numel(),
cncl_dtype,
comm,
comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_var),
GetBasePtr(&invstd_all),
local_var.numel(),
cncl_dtype,
comm,
comm_stream));
        // After the comm stream executes, sync the queue so the compute
        // stream can be used correctly.
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
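    // When CNCL is not compiled in, this always-false branch keeps the
    // trailing "} else {" well-formed so the single-card path below is taken.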
if (NO_USE_CNCL) {
#endif
} else {
count_all = input_count;
mean_all.ShareDataWith(local_mean);
invstd_all.ShareDataWith(local_var);
mean_all.Resize(phi::make_ddim({1, local_mean.numel()}));
invstd_all.Resize(phi::make_ddim({1, local_var.numel()}));
}
MLUCnnlTensorDesc desc_all_mean_invstd(
invstd_all, CNNL_LAYOUT_NC, ToCnnlDataType<MPDType>());
MLUCnnlTensorDesc desc_moving_mean_var(*mean_out);
MLUCnnlTensorDesc desc_saved_mean_var(*saved_mean);
MLUCnnlTensorDesc desc_count_all(count_all);
MLUCnnl::SyncBatchNormGatherStatsWithCounts(ctx,
momentum,
epsilon,
desc_all_mean_invstd.get(),
GetBasePtr(&mean_all),
desc_all_mean_invstd.get(),
GetBasePtr(&invstd_all),
desc_moving_mean_var.get(),
GetBasePtr(mean_out),
desc_moving_mean_var.get(),
GetBasePtr(variance_out),
desc_count_all.get(),
GetBasePtr(&count_all),
desc_saved_mean_var.get(),
GetBasePtr(saved_mean),
desc_saved_mean_var.get(),
GetBasePtr(saved_variance));
MLUCnnlTensorDesc desc_other_param(*saved_mean);
MLUCnnl::SyncBatchNormElemt(ctx,
desc_trans.get(),
GetBasePtr(&trans_x),
desc_other_param.get(),
GetBasePtr(saved_mean),
desc_other_param.get(),
GetBasePtr(saved_variance),
desc_other_param.get(),
GetBasePtr(scale),
desc_other_param.get(),
GetBasePtr(bias),
desc_trans.get(),
GetBasePtr(&trans_y));
}
if (need_transpose) {
MLUCnnlTensorDesc desc_y(*y);
MLUCnnlTensorDesc desc_trans_y(trans_y);
MLUCnnl::Transpose(ctx,
backward_perm,
trans_y.dims().size(),
desc_trans_y.get(),
GetBasePtr(&trans_y),
desc_y.get(),
GetBasePtr(y));
}
}
};
template <typename T>
class SyncBatchNormMLUGradKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const std::string layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout layout = phi::StringToDataLayout(layout_str);
const auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
// init output
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
const auto *saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
const auto *saved_inv_var = ctx.Input<phi::DenseTensor>("SavedVariance");
const phi::DenseTensor *x;
if (ctx.HasInput("Y")) {
PADDLE_ENFORCE_EQ(true,
false,
platform::errors::InvalidArgument(
"sync_batch_norm_grad doesn't support input Y"));
} else {
x = ctx.Input<phi::DenseTensor>("X");
}
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
platform::errors::InvalidArgument(
"The Input X dim size should be larger than 1."));
PADDLE_ENFORCE_LE(x_dims.size(),
5,
platform::errors::InvalidArgument(
"The Input X dim size should be less than 6."));
int N, C, H, W, D;
phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
PADDLE_ENFORCE_EQ(scale->dims()[0],
C,
platform::errors::InvalidArgument(
"Expected first dim for input parameter(scale) of "
"OP(sync_batch_norm) be (%d), but given (%d).",
C,
scale->dims()[0]));
d_x->mutable_data<T>(ctx.GetPlace());
if (d_scale && d_bias) {
d_scale->mutable_data<MPDType>(ctx.GetPlace());
d_bias->mutable_data<MPDType>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(scale->dims().size(),
1UL,
platform::errors::InvalidArgument(
"Expected rank for input parameter(scale) of "
"OP(sync_batch_norm) be (1), but given (%d).",
scale->dims().size()));
phi::DenseTensor trans_x;
phi::DenseTensor trans_dy;
phi::DenseTensor trans_dx;
std::vector<int> forward_perm;
std::vector<int> backward_perm;
std::vector<int> trans_shape;
const bool need_transpose =
((layout == DataLayout::kNCHW && x_dims.size() != 2) ||
x_dims.size() == 5);
if (need_transpose) {
SetMLUTransposePerm(
x_dims, layout, &forward_perm, &backward_perm, &trans_shape);
trans_x.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
trans_dy.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
trans_dx.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
MLUCnnlTensorDesc desc_x(*x);
MLUCnnlTensorDesc desc_trans_x(
trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype()));
MLUCnnl::Transpose(ctx,
forward_perm,
x_dims.size(),
desc_x.get(),
GetBasePtr(x),
desc_trans_x.get(),
GetBasePtr(&trans_x));
MLUCnnl::Transpose(ctx,
forward_perm,
x_dims.size(),
desc_x.get(),
GetBasePtr(d_y),
desc_trans_x.get(),
GetBasePtr(&trans_dy));
} else {
trans_x = *x;
trans_dy = *d_y;
trans_dx = *d_x;
}
MLUCnnlTensorDesc desc_trans(
trans_x,
supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET],
ToCnnlDataType<T>());
phi::DenseTensor sum_dy, sum_dy_xmu;
sum_dy.mutable_data<MPDType>(bias->dims(), ctx.GetPlace());
sum_dy_xmu.mutable_data<MPDType>(bias->dims(), ctx.GetPlace());
MLUCnnlTensorDesc desc_other_param(*bias);
MLUCnnl::SyncBatchnormBackwardReduce(
ctx,
desc_trans.get(),
GetBasePtr(&trans_dy),
desc_trans.get(),
GetBasePtr(&trans_x),
desc_other_param.get(),
GetBasePtr(saved_mean),
desc_other_param.get(),
GetBasePtr(saved_inv_var),
d_scale ? desc_other_param.get() : nullptr,
d_scale ? GetBasePtr(d_scale) : nullptr,
d_bias ? desc_other_param.get() : nullptr,
d_bias ? GetBasePtr(d_bias) : nullptr,
desc_other_param.get(),
GetBasePtr(&sum_dy),
desc_other_param.get(),
GetBasePtr(&sum_dy_xmu),
true /*compute sum_dy, sum_dy_xmu*/,
d_scale ? true : false /*compute d_scale*/,
d_bias ? true : false /*compute d_bias*/);
phi::DenseTensor numel_count;
numel_count.mutable_data<int32_t>(phi::make_ddim({1}), ctx.GetPlace());
FillMLUTensorWithHostValue<int32_t>(
ctx, static_cast<int32_t>(x->numel() / C), &numel_count);
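    // With CNCL available, all-reduce the element count and the partial sums
    // (sum_dy, sum_dy_xmu) so the gradient below uses global statistics.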
#ifdef PADDLE_WITH_CNCL
auto &dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto *comm = dev_ctx.cncl_comm();
if (comm) {
auto cncl_comm =
paddle::platform::CNCLCommContext::Instance().Get(0, ctx.GetPlace());
auto *comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
      // Before the comm stream executes, the compute stream must be synced.
dev_ctx.Wait();
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(numel_count.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&numel_count),
GetBasePtr(&numel_count),
1,
dtype,
cnclSum,
comm,
comm_stream));
auto cncl_dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(sum_dy.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy),
GetBasePtr(&sum_dy),
sum_dy.numel(),
cncl_dtype,
cnclSum,
comm,
comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy_xmu),
GetBasePtr(&sum_dy_xmu),
sum_dy_xmu.numel(),
cncl_dtype,
cnclSum,
comm,
comm_stream));
      // After the comm stream executes, sync the queue so the compute
      // stream can be used correctly.
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
}
#endif
if (d_x) {
MLUCnnlTensorDesc desc_count(numel_count);
MLUCnnl::SyncBatchNormBackwardElemt(ctx,
desc_trans.get(),
GetBasePtr(&trans_dy),
desc_trans.get(),
GetBasePtr(&trans_x),
desc_other_param.get(),
GetBasePtr(saved_mean),
desc_other_param.get(),
GetBasePtr(saved_inv_var),
desc_other_param.get(),
GetBasePtr(scale),
desc_other_param.get(),
GetBasePtr(&sum_dy),
desc_other_param.get(),
GetBasePtr(&sum_dy_xmu),
desc_count.get(),
GetBasePtr(&numel_count),
desc_trans.get(),
GetBasePtr(&trans_dx));
if (need_transpose) {
MLUCnnlTensorDesc desc_dx(*d_x);
MLUCnnlTensorDesc desc_trans_dx(trans_dx);
MLUCnnl::Transpose(ctx,
backward_perm,
trans_dx.dims().size(),
desc_trans_dx.get(),
GetBasePtr(&trans_dx),
desc_dx.get(),
GetBasePtr(d_x));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(sync_batch_norm,
ops::SyncBatchNormMLUKernel<float>,
ops::SyncBatchNormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(sync_batch_norm_grad,
ops::SyncBatchNormMLUGradKernel<float>,
ops::SyncBatchNormMLUGradKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/tile_op_functor.h"
namespace paddle {
namespace operators {
template <typename T>
class TileMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The rank of the input 'x' for tile op must be a positive "
"integer, but the value received is %d.",
rank));
PADDLE_ENFORCE_LE(
rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank of the input 'x' for tile op "
"must be less than or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
rank));
auto repeat_times = get_repeat_times(context);
int repeat_times_size = repeat_times.size();
PADDLE_ENFORCE_GE(
repeat_times_size,
1,
platform::errors::InvalidArgument(
"The number of elements of the input 'repeat_times' for tile "
"op must be positive, but the value received is %d.",
repeat_times_size));
PADDLE_ENFORCE_LE(
repeat_times_size,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The number of elements of the input 'repeat_times' for tile op "
"must be less than or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
repeat_times_size));
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
for (size_t i = 0; i < repeat_times.size(); ++i) {
PADDLE_ENFORCE_GT(
repeat_times[i],
0,
platform::errors::InvalidArgument(
"All elements of the input 'repeat_times' for tile op must "
"be positive integers, but the value received is %d.",
repeat_times[i]));
}
auto vec_in_dims = phi::vectorize<int>(in_dims);
if (repeat_times.size() < vec_in_dims.size()) {
int diff = vec_in_dims.size() - repeat_times.size();
repeat_times.insert(repeat_times.begin(), diff, 1);
} else {
int diff = repeat_times.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
}
PADDLE_ENFORCE_EQ(
repeat_times.size(),
vec_in_dims.size(),
platform::errors::InvalidArgument(
"The rank (%d) of the input 'x' and the rank (%d) of the input "
"'repeat_times' for tile op must match after promotion.",
vec_in_dims.size(),
repeat_times.size()));
auto* out0 = context.Output<phi::DenseTensor>("Out");
bool repeat_one_times = true;
for (size_t i = 0; i < repeat_times.size(); ++i) {
if (repeat_times[i] != 1) {
repeat_one_times = false;
}
}
if (repeat_one_times) {
paddle::framework::TensorCopy(*in0, context.GetPlace(), out0);
} else {
framework::DDim new_in_dims = phi::make_ddim(vec_in_dims);
framework::DDim out_dims(new_in_dims);
for (size_t i = 0; i < repeat_times.size(); ++i) {
out_dims[i] *= repeat_times[i];
}
out0->Resize(out_dims);
out0->mutable_data<T>(context.GetPlace());
MLUCnnlTensorDesc x_desc(*in0);
MLUCnnlTensorDesc out_desc(*out0);
MLUCnnl::BroadcastTo(context,
x_desc.get(),
GetBasePtr(in0),
out_desc.get(),
GetBasePtr(out0));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(tile,
ops::TileMLUKernel<bool>,
ops::TileMLUKernel<int>,
ops::TileMLUKernel<int64_t>,
ops::TileMLUKernel<float>);
#endif
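// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the shape promotion done in TileMLUKernel: repeat_times
// and the input dims are left-padded with 1s to the same rank, then
// multiplied element-wise to form the output dims handed to BroadcastTo.
#include <cstddef>
#include <vector>

std::vector<int> TileOutputDimsSketch(std::vector<int> in_dims,
                                      std::vector<int> repeat_times) {
  if (repeat_times.size() < in_dims.size()) {
    repeat_times.insert(repeat_times.begin(),
                        in_dims.size() - repeat_times.size(), 1);
  } else {
    in_dims.insert(in_dims.begin(), repeat_times.size() - in_dims.size(), 1);
  }
  std::vector<int> out_dims(in_dims);
  for (std::size_t i = 0; i < out_dims.size(); ++i) {
    out_dims[i] *= repeat_times[i];
  }
  // e.g. in_dims = {2, 3}, repeat_times = {2, 1, 2} -> out_dims = {2, 2, 6}
  return out_dims;
}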
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/top_k_op.h"
namespace paddle {
namespace operators {
template <typename T>
class TopkMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
const auto& place = ctx.GetPlace();
size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<phi::DenseTensor>("K");
if (k_t) {
auto k_t_ptr = static_cast<const void*>(k_t->data<int>());
auto size = k_t->numel() * sizeof(int);
memory::Copy(platform::CPUPlace(),
reinterpret_cast<void*>(&k),
k_t->place(),
k_t_ptr,
size,
nullptr);
framework::DDim output_dims = output->dims();
output_dims[output_dims.size() - 1] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
output->mutable_data<T>(place);
indices->mutable_data<int64_t>(place);
const bool largest = true;
const bool sorted = true;
const int axis = -1;
    // cnnl only supports int32/int16 indices
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int32_t>(place);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc values_output_desc(*output);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnl::TopK(ctx,
k,
axis,
largest,
sorted,
input_desc.get(),
GetBasePtr(input),
values_output_desc.get(),
GetBasePtr(output),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
// cast indices type to int64
MLUCnnlTensorDesc cast_output_desc(*indices);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
cast_output_desc.get(),
GetBasePtr(indices));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(top_k,
ops::TopkMLUKernel<float>,
ops::TopkMLUKernel<paddle::platform::float16>);
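// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the top-k semantics used by TopkMLUKernel on one row of
// the last axis: the k largest values, sorted in descending order, with
// int64 indices (the MLU kernel produces int32 indices and casts afterwards).
// Precondition: k <= row.size().
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

void TopKRowSketch(const std::vector<float>& row, std::size_t k,
                   std::vector<float>* values, std::vector<int64_t>* indices) {
  std::vector<int64_t> idx(row.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&row](int64_t a, int64_t b) { return row[a] > row[b]; });
  values->clear();
  indices->clear();
  for (std::size_t i = 0; i < k; ++i) {
    indices->push_back(idx[i]);
    values->push_back(row[idx[i]]);
  }
}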
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class TopkV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
const auto& place = ctx.GetPlace();
const auto& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
const auto& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
    // if axis < 0, calculate the real axis
int axis = static_cast<int>(ctx.Attr<int>("axis"));
if (axis < 0) {
const auto& in_dims = input->dims();
axis += in_dims.size();
}
size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<phi::DenseTensor>("K");
if (k_t) {
auto k_t_ptr = static_cast<const void*>(k_t->data<int>());
auto size = k_t->numel() * sizeof(int);
memory::Copy(platform::CPUPlace(),
reinterpret_cast<void*>(&k),
k_t->place(),
k_t_ptr,
size,
nullptr);
framework::DDim output_dims = output->dims();
      // according to axis, set the K value in that dim
output_dims[axis] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
output->mutable_data<T>(place);
indices->mutable_data<int64_t>(place);
    // cnnl only supports int32/int16 indices
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int32_t>(place);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc values_output_desc(*output);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnl::TopK(ctx,
k,
axis,
largest,
sorted,
input_desc.get(),
GetBasePtr(input),
values_output_desc.get(),
GetBasePtr(output),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
// cast indices type to int64
MLUCnnlTensorDesc cast_output_desc(*indices);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
cast_output_desc.get(),
GetBasePtr(indices));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(top_k_v2,
ops::TopkV2MLUKernel<float>,
ops::TopkV2MLUKernel<paddle::platform::float16>);
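// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the axis handling in TopkV2MLUKernel: a negative axis is
// normalized by adding the rank, and only dims[axis] is replaced by k when
// computing the output shape.
#include <cstdint>
#include <vector>

std::vector<int64_t> TopKV2OutputDimsSketch(std::vector<int64_t> dims,
                                            int axis, int64_t k) {
  if (axis < 0) {
    axis += static_cast<int>(dims.size());
  }
  dims[axis] = k;  // e.g. dims = {4, 5, 6}, axis = -1, k = 2 -> {4, 5, 2}
  return dims;
}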
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class TransposeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
out->mutable_data<T>(ctx.device_context().GetPlace());
TransposeFromMLUTensor<T>(
ctx, axis, x, out, false /*need_reshape_or_alloc*/);
}
};
template <typename T>
class TransposeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
std::vector<int> reversed_axis(axis);
for (size_t i = 0; i < axis.size(); i++) {
reversed_axis[axis[i]] = i;
}
x_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(
ctx, reversed_axis, out_grad, x_grad, false /*need_reshape_or_alloc*/);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(transpose2,
ops::TransposeMLUKernel<float>,
ops::TransposeMLUKernel<paddle::platform::float16>,
ops::TransposeMLUKernel<int>,
ops::TransposeMLUKernel<int16_t>,
ops::TransposeMLUKernel<uint8_t>,
ops::TransposeMLUKernel<int8_t>,
ops::TransposeMLUKernel<bool>);
REGISTER_OP_MLU_KERNEL(transpose2_grad,
ops::TransposeGradMLUKernel<float>,
ops::TransposeGradMLUKernel<paddle::platform::float16>,
ops::TransposeGradMLUKernel<int>,
ops::TransposeGradMLUKernel<int16_t>,
ops::TransposeGradMLUKernel<uint8_t>,
ops::TransposeGradMLUKernel<int8_t>,
ops::TransposeGradMLUKernel<bool>);
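// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the inverse permutation built in TransposeGradMLUKernel:
// if the forward pass transposed with `axis`, the backward pass transposes
// the output gradient with reversed_axis, where reversed_axis[axis[i]] = i.
#include <cstddef>
#include <vector>

std::vector<int> InversePermutationSketch(const std::vector<int>& axis) {
  std::vector<int> reversed_axis(axis.size());
  for (std::size_t i = 0; i < axis.size(); ++i) {
    reversed_axis[axis[i]] = static_cast<int>(i);
  }
  return reversed_axis;  // e.g. axis = {1, 2, 0} -> reversed_axis = {2, 0, 1}
}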
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class TrilTriuMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int diagonal = ctx.Attr<int>("diagonal");
bool lower = ctx.Attr<bool>("lower");
    bool upper = !lower;
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::TrilTriu(ctx,
diagonal,
upper,
x_desc.get(),
GetBasePtr(x),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(tril_triu,
ops::TrilTriuMLUKernel<float>,
ops::TrilTriuMLUKernel<int32_t>,
ops::TrilTriuMLUKernel<plat::float16>);
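// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the usual tril/triu rule that the MLUCnnl::TrilTriu call
// above is assumed to follow on a row-major 2-D matrix: element (i, j) is
// kept when j - i <= diagonal for the lower triangle, or j - i >= diagonal
// for the upper triangle, and zeroed otherwise.
#include <vector>

void TrilTriuSketch(const std::vector<float>& in, int rows, int cols,
                    int diagonal, bool upper, std::vector<float>* out) {
  out->assign(in.size(), 0.0f);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const bool keep = upper ? (j - i >= diagonal) : (j - i <= diagonal);
      if (keep) {
        (*out)[i * cols + j] = in[i * cols + j];
      }
    }
  }
}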
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {
template <typename T>
class TruncatedGaussianRandomMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<phi::DenseTensor>("Out");
tensor->mutable_data<T>(context.GetPlace());
phi::DenseTensor cpu_tensor(tensor->dtype());
cpu_tensor.Resize(tensor->dims());
T* data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
1.0);
TruncatedNormal<T> truncated_normal(mean, std);
int64_t size = tensor->numel();
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = phi::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = truncated_normal(dist(*engine));
}
auto& dev_ctx =
context.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(cpu_tensor, context.GetPlace(), dev_ctx, tensor);
dev_ctx.Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(truncated_gaussian_random,
ops::TruncatedGaussianRandomMLUKernel<float>);
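// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the host-side pattern used above: fill a CPU buffer with
// truncated-normal samples, then copy the buffer to the device tensor. The
// original kernel maps uniform draws through the framework's TruncatedNormal
// functor from truncated_gaussian_random_op.h; the rejection-sampling
// transform below, truncating at two standard deviations, is only an
// illustrative stand-in for that functor.
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> TruncatedGaussianCpuSketch(float mean, float stddev,
                                              std::size_t n, unsigned seed) {
  std::mt19937 engine(seed);
  std::normal_distribution<float> dist(mean, stddev);
  std::vector<float> data(n);
  for (std::size_t i = 0; i < n; ++i) {
    float v = dist(engine);
    while (v < mean - 2.0f * stddev || v > mean + 2.0f * stddev) {
      v = dist(engine);  // reject samples outside [mean - 2*std, mean + 2*std]
    }
    data[i] = v;
  }
  return data;  // the real kernel TensorCopy()s such a buffer to the MLU
}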
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/uniform_random_op.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUUniformRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
phi::DenseTensor *tensor = nullptr;
auto out_var = ctx.OutputVar("Out");
std::vector<int64_t> new_shape;
auto list_new_shape_tensor =
ctx.MultiInput<phi::DenseTensor>("ShapeTensorList");
if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) {
if (ctx.HasInput("ShapeTensor")) {
auto *shape_tensor = ctx.Input<phi::DenseTensor>("ShapeTensor");
new_shape = GetNewDataFromShapeTensor(shape_tensor);
} else if (list_new_shape_tensor.size() > 0) {
new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor);
}
}
if (out_var->IsType<phi::SelectedRows>()) {
auto *selected_rows = out_var->GetMutable<phi::SelectedRows>();
tensor = selected_rows->mutable_value();
auto shape = ctx.Attr<std::vector<int64_t>>("shape");
if (!new_shape.empty()) shape = new_shape;
tensor->Resize(phi::make_ddim(shape));
selected_rows->mutable_rows()->reserve(shape[0]);
} else if (out_var->IsType<phi::DenseTensor>()) {
tensor = out_var->GetMutable<phi::DenseTensor>();
if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) in uniform_random_op must be "
"phi::DenseTensor, "
"SelectedRows. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
tensor->mutable_data<T>(ctx.GetPlace());
int64_t size = tensor->numel();
phi::DenseTensor cpu_tensor(tensor->dtype());
cpu_tensor.Resize(tensor->dims());
T *data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::uniform_real_distribution<T> dist(
static_cast<T>(ctx.Attr<float>("min")),
static_cast<T>(ctx.Attr<float>("max")));
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
auto engine = phi::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine);
}
unsigned int diag_num =
static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
unsigned int diag_step =
static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
if (diag_num > 0) {
PADDLE_ENFORCE_GT(
size,
(diag_num - 1) * (diag_step + 1),
platform::errors::InvalidArgument(
"ShapeInvalid: the diagonal's elements is equal (num-1) "
"* (step-1) with num %d, step %d,"
"It should be smaller than %d, but received %d",
diag_num,
diag_step,
(diag_num - 1) * (diag_step + 1),
size));
for (int64_t i = 0; i < diag_num; ++i) {
int64_t pos = i * diag_step + i;
data_cpu[pos] = diag_val;
}
}
// copy to MLU
framework::TensorCopy(
cpu_tensor,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
tensor);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(uniform_random,
paddle::operators::MLUUniformRandomKernel<float>);
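// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the CPU-side fill used above: uniform samples in
// [min, max), with diag_num positions then overwritten by diag_val at a
// stride of diag_step + 1 (position i * diag_step + i for the i-th one).
// Precondition (enforced by the kernel): size > (diag_num - 1) * (diag_step + 1).
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> UniformWithDiagSketch(std::size_t size, float min, float max,
                                         unsigned seed, int diag_num,
                                         int diag_step, float diag_val) {
  std::mt19937 engine(seed);
  std::uniform_real_distribution<float> dist(min, max);
  std::vector<float> data(size);
  for (std::size_t i = 0; i < size; ++i) {
    data[i] = dist(engine);
  }
  for (int i = 0; i < diag_num; ++i) {
    data[static_cast<std::size_t>(i) * (diag_step + 1)] = diag_val;
  }
  return data;  // the real kernel then TensorCopy()s this buffer to the MLU
}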
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <memory>
#include <string>
#include "paddle/fluid/operators/unsqueeze_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
unsqueeze,
ops::UnsqueezeKernel<plat::MLUDeviceContext, float>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, double>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
unsqueeze2,
ops::UnsqueezeKernel<plat::MLUDeviceContext, float>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, double>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
unsqueeze_grad,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, float>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, double>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, plat::float16>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, bool>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int8_t>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, float>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, double>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, plat::float16>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int64_t>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class UnStackMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.MultiOutput<phi::DenseTensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += x->dims().size();
int num = x->dims()[axis];
std::vector<MLUCnnlTensorDesc> out_descs;
std::vector<cnnlTensorDescriptor_t> out_raw_descs;
std::vector<void *> out_ptrs;
std::vector<int64_t> new_dims = phi::vectorize(x->dims());
new_dims[axis] = 1;
for (int i = 0; i < num; i++) {
out[i]->mutable_data<T>(ctx.GetPlace());
out_descs.emplace_back(MLUCnnlTensorDesc(
new_dims.size(), new_dims.data(), ToCnnlDataType<T>()));
out_raw_descs.push_back(out_descs.back().get());
out_ptrs.push_back(GetBasePtr(out[i]));
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnl::Split(ctx,
num,
axis,
x_desc.get(),
GetBasePtr(x),
out_raw_descs.data(),
out_ptrs.data());
}
};
template <typename T>
class UnStackGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("Y"));
auto *y = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int num = static_cast<int>(x.size());
std::vector<MLUCnnlTensorDesc> x_descs;
std::vector<cnnlTensorDescriptor_t> x_raw_descs;
std::vector<const void *> x_ptrs;
for (int i = 0; i < num; i++) {
if (x[i]->dims().size() != 0) {
std::vector<int64_t> in_dims = phi::vectorize(x[i]->dims());
in_dims.insert(in_dims.begin() + axis, 1);
x_descs.emplace_back(MLUCnnlTensorDesc(
in_dims.size(), in_dims.data(), ToCnnlDataType<T>()));
} else {
int input_dims = 1;
x_descs.emplace_back(
MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType<T>()));
}
x_raw_descs.push_back(x_descs.back().get());
x_ptrs.push_back(GetBasePtr(x[i]));
}
y->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc y_desc(*y);
MLUCnnl::Concat(ctx,
num,
axis,
x_raw_descs.data(),
x_ptrs.data(),
y_desc.get(),
GetBasePtr(y));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(unstack,
ops::UnStackMLUKernel<float>,
ops::UnStackMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(unstack_grad,
ops::UnStackGradMLUKernel<float>,
ops::UnStackGradMLUKernel<plat::float16>);
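// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the shape bookkeeping above: unstack describes each of
// dims[axis] outputs with that dimension set to 1 before splitting, and the
// grad kernel re-inserts a size-1 axis into each incoming gradient before
// concatenating them back along `axis`.
#include <cstdint>
#include <vector>

std::vector<int64_t> UnStackSliceDimsSketch(std::vector<int64_t> x_dims,
                                            int axis) {
  if (axis < 0) axis += static_cast<int>(x_dims.size());
  x_dims[axis] = 1;  // e.g. x_dims = {2, 3, 4}, axis = 1 -> each slice {2, 1, 4}
  return x_dims;
}

std::vector<int64_t> UnStackGradSliceDimsSketch(std::vector<int64_t> dy_dims,
                                                int axis) {
  if (axis < 0) axis += static_cast<int>(dy_dims.size()) + 1;
  dy_dims.insert(dy_dims.begin() + axis, int64_t{1});  // {2, 4}, axis = 1 -> {2, 1, 4}
  return dy_dims;
}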
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUWhereIndexKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<phi::DenseTensor>("Condition");
auto* out = context.Output<phi::DenseTensor>("Out");
auto dims = condition->dims();
const int rank = dims.size();
phi::DenseTensor num_true;
num_true.mutable_data<int>({1}, context.GetPlace());
MLUCnnlTensorDesc con_desc(*condition);
MLUCnnlTensorDesc num_true_desc(num_true);
MLUCnnl::NumTrue(context,
con_desc.get(),
GetBasePtr(condition),
num_true_desc.get(),
GetBasePtr(&num_true));
phi::DenseTensor local_true_num;
paddle::framework::TensorCopySync(
num_true, platform::CPUPlace(), &local_true_num);
auto true_num = *local_true_num.data<int>();
out->Resize(phi::make_ddim({true_num, rank}));
out->mutable_data<int64_t>(context.GetPlace());
if (true_num == 0) {
return;
}
auto& dev_ctx = context.template device_context<MLUDeviceContext>();
phi::DenseTensor out_int32 =
context.AllocateTmpTensor<int32_t, MLUDeviceContext>(out->dims(),
dev_ctx);
MLUCnnlTensorDesc out_int32_desc(out_int32);
MLUCnnlTensorDesc out_desc(*out);
bool as_tuple = false;
MLUCnnl::Where(context,
con_desc.get(),
GetBasePtr(condition),
num_true_desc.get(),
GetBasePtr(&num_true),
as_tuple,
out_int32_desc.get(),
GetBasePtr(&out_int32));
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(context,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_int32),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(where_index,
ops::MLUWhereIndexKernel<int>,
ops::MLUWhereIndexKernel<bool>,
ops::MLUWhereIndexKernel<float>);
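// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the output layout produced above: for every true element
// of a row-major condition tensor, one row of `rank` coordinates, giving an
// Out tensor of shape {true_num, rank} (written as int32 on the MLU and then
// cast to int64). Precondition: condition.size() equals the product of dims.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> WhereIndexSketch(const std::vector<bool>& condition,
                                      const std::vector<int64_t>& dims) {
  std::vector<int64_t> coords;  // flattened {true_num, rank}, row-major
  const int rank = static_cast<int>(dims.size());
  for (std::size_t flat = 0; flat < condition.size(); ++flat) {
    if (!condition[flat]) continue;
    std::vector<int64_t> idx(rank);
    int64_t rem = static_cast<int64_t>(flat);
    for (int d = rank - 1; d >= 0; --d) {
      idx[d] = rem % dims[d];
      rem /= dims[d];
    }
    coords.insert(coords.end(), idx.begin(), idx.end());
  }
  return coords;
}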
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class WhereMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<phi::DenseTensor>("Condition");
auto* X = context.Input<phi::DenseTensor>("X");
auto* Y = context.Input<phi::DenseTensor>("Y");
auto* out = context.Output<phi::DenseTensor>("Out");
auto place = context.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*X);
MLUCnnlTensorDesc y_desc(*Y);
MLUCnnlTensorDesc condition_desc(*condition);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Select(context,
condition_desc.get(),
GetBasePtr(condition),
x_desc.get(),
GetBasePtr(X),
y_desc.get(),
GetBasePtr(Y),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
where,
ops::WhereMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::WhereMLUKernel<paddle::platform::MLUDeviceContext, int>);
#endif
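// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the elementwise select computed by MLUCnnl::Select above,
// assuming condition, x and y share one shape (no broadcasting here):
//   out[i] = condition[i] ? x[i] : y[i]
#include <cstddef>
#include <vector>

std::vector<float> WhereSketch(const std::vector<bool>& condition,
                               const std::vector<float>& x,
                               const std::vector<float>& y) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    out[i] = condition[i] ? x[i] : y[i];
  }
  return out;
}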