Unverified commit bb48b596, authored by Young-Flash, committed by GitHub

delete paddle/fluid/operators/*_mlu.* files (#52435)

Parent 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
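// abs forward on MLU: out = |x|, computed with the cnnl Abs primitive.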
template <typename T>
class AbsMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Abs(ctx,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
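// abs backward on MLU: dx = sign(x) * dout. sign(x) is materialized into a
// temporary tensor and multiplied with dout via OpTensor(CNNL_OP_TENSOR_MUL).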
template <typename T>
class AbsGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*x);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
phi::DenseTensor sign_x;
sign_x.mutable_data<T>(x->dims(), ctx.GetPlace());
MLUCnnl::Sign(ctx,
input_desc.get(),
GetBasePtr(x),
input_desc.get(),
GetBasePtr(&sign_x));
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
input_desc.get(),
GetBasePtr(&sign_x),
input_desc.get(),
GetBasePtr(dout),
input_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(abs,
ops::AbsMLUKernel<float>,
ops::AbsMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(abs_grad,
ops::AbsGradMLUKernel<float>,
ops::AbsGradMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
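// arg_max on MLU: implemented as a cnnl Reduce with CNNL_REDUCE_MAX and
// CNNL_REDUCE_ONLY_INDICES. The reduce emits int32 indices, so an int64
// output (dtype == 3) goes through an int32 buffer followed by a Cast.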
template <typename T>
class ArgMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto axis = static_cast<int>(ctx.Attr<int64_t>("axis"));
auto dtype = ctx.Attr<int>("dtype");
const bool& flatten = ctx.Attr<bool>("flatten");
if (x->numel() == 0) return;
PADDLE_ENFORCE_EQ(
(dtype == 2 || dtype == 3),
true,
platform::errors::InvalidArgument(
"The attribute of dtype in argmax op must be [%s] or [%s], "
"but "
"received [%s]",
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
static_cast<framework::proto::VarType::Type>(dtype))));
if (axis < 0) {
framework::DDim x_dims;
x_dims = x->dims();
axis += x_dims.size();
}
phi::DenseTensor flatten_x(x->type());
flatten_x.ShareDataWith(*x);
if (flatten) {
flatten_x.Resize(phi::make_ddim({x->numel()}));
// if flatten is true, treat axis as 0
axis = 0;
}
std::vector<int> reduce_dims;
reduce_dims.push_back(axis);
auto out_dims = out->dims();
int out_count = out_dims[0];
for (int i = 1; i < out_dims.size(); i++) {
out_count = out_count * out_dims[i];
}
size_t indices_size_inbytes = out_count * sizeof(int32_t);
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor value_out =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(out->dims(), dev_ctx);
MLUCnnlTensorDesc value_out_desc(value_out);
MLUCnnlTensorDesc input_desc(
flatten_x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(flatten_x.dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_MAX,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_ONLY_INDICES,
CNNL_32BIT_INDICES);
if (dtype == 2) {
out->template mutable_data<int32_t>(ctx.GetPlace());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(&flatten_x),
indices_size_inbytes /*indices_size*/,
GetBasePtr(out),
nullptr,
value_out_desc.get(),
GetBasePtr(&value_out));
} else {
out->template mutable_data<int64_t>(ctx.GetPlace());
phi::DenseTensor out_int32 =
ctx.AllocateTmpTensor<int32_t, MLUDeviceContext>(out->dims(),
dev_ctx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(&flatten_x),
indices_size_inbytes /*indices_size*/,
GetBasePtr(&out_int32),
nullptr,
value_out_desc.get(),
GetBasePtr(&value_out));
// cast indices type to int64
MLUCnnlTensorDesc out_int32_desc(out_int32);
MLUCnnlTensorDesc cast_output_desc(*out);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_int32),
cast_output_desc.get(),
GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(arg_max,
ops::ArgMaxMLUKernel<int>,
ops::ArgMaxMLUKernel<float>,
ops::ArgMaxMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
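// argsort on MLU: implemented as a full-length TopK (k equals the size of
// the sorted axis). Indices come back as int32 and are cast to int64.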
template <typename T>
class ArgsortMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
const auto& place = ctx.GetPlace();
const auto& sorted = true;
const bool descending = ctx.Attr<bool>("descending");
// if axis < 0, calculate the real axis
int axis = static_cast<int>(ctx.Attr<int>("axis"));
if (axis < 0) {
const auto& in_dims = input->dims();
axis += in_dims.size();
}
auto in_dims = input->dims();
size_t k = in_dims[axis];
output->mutable_data<T>(place);
indices->mutable_data<int64_t>(place);
// cnnl only supports int32/int16 indices
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int32_t>(place);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc values_output_desc(*output);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnl::TopK(ctx,
k,
axis,
descending,
sorted,
input_desc.get(),
GetBasePtr(input),
values_output_desc.get(),
GetBasePtr(output),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
// cast indices type to int64
MLUCnnlTensorDesc cast_output_desc(*indices);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
cast_output_desc.get(),
GetBasePtr(indices));
}
};
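// argsort backward on MLU: scatter dout back to the pre-sort positions along
// `axis`, using the indices saved by the forward pass.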
template <typename T>
class ArgsortGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
int axis = ctx.Attr<int>("axis");
dx->mutable_data<T>(ctx.GetPlace());
auto in_dims = indices->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (dout->numel() == 0) return;
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc indices_desc(*indices);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::ScatterFunctor(ctx,
dx_desc.get(),
GetBasePtr(dx),
dout_desc.get(),
GetBasePtr(dout),
indices_desc.get(),
GetBasePtr(indices),
axis);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(argsort,
ops::ArgsortMLUKernel<paddle::platform::float16>,
ops::ArgsortMLUKernel<float>,
ops::ArgsortMLUKernel<int8_t>,
ops::ArgsortMLUKernel<uint8_t>,
ops::ArgsortMLUKernel<int16_t>,
ops::ArgsortMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(argsort_grad,
ops::ArgsortGradMLUKernel<paddle::platform::float16>,
ops::ArgsortGradMLUKernel<float>,
ops::ArgsortGradMLUKernel<int8_t>,
ops::ArgsortGradMLUKernel<uint8_t>,
ops::ArgsortGradMLUKernel<int16_t>,
ops::ArgsortGradMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/assign_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class AssignMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Assign(
ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(assign,
ops::AssignMLUKernel<int>,
ops::AssignMLUKernel<float>,
ops::AssignMLUKernel<plat::float16>,
ops::AssignMLUKernel<bool>)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h"
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(assign_value,
ops::AssignValueKernel<bool>,
ops::AssignValueKernel<int>,
ops::AssignValueKernel<int64_t>,
ops::AssignValueKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
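// batch_norm forward on MLU works in NHWC: an NCHW input is first viewed as
// (N, C, H*W, 1) and transposed with perm {0, 2, 3, 1} into the
// (N, H*W, 1, C) working layout before FusedBatchNorm is called.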
template <typename T>
class MLUBatchNormOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto &place = ctx.GetPlace();
const float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
bool global_stats = test_mode || use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_var = ctx.Input<phi::DenseTensor>("Variance");
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
auto *y = ctx.Output<phi::DenseTensor>("Y");
auto *mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
auto *variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
auto *saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
auto *saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
// alloc memory
y->mutable_data<T>(place);
mean_out->mutable_data<MPDType>(place);
variance_out->mutable_data<MPDType>(place);
saved_mean->mutable_data<MPDType>(place);
saved_variance->mutable_data<MPDType>(place);
phi::DenseTensor transformed_x;
phi::DenseTensor transformed_y;
const int transformed_dim_size = 4;
const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
transformed_shape,
ToCnnlDataType<T>(),
CNNL_LAYOUT_NHWC);
MLUCnnlTensorDesc others_input_desc(*scale);
// If the input dimension is 2 and the format is NCHW, the input can be
// regarded as NHWC; no transpose is needed.
bool need_transpose =
(data_layout == DataLayout::kNCHW && x_dims.size() != 2);
if (need_transpose) {
auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
const int x_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc x_reshaped_desc(
transformed_dim_size, x_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
x_reshaped_desc.get(),
GetBasePtr(x),
transformed_desc.get(),
GetBasePtr(&transformed_x));
} else {
transformed_x = *x;
transformed_y = *y;
}
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
phi::DenseTensor mom_cpu;
framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
MLUCnnl::FusedBatchNorm(ctx,
!global_stats,
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(bias),
GetBasePtr(running_mean),
GetBasePtr(running_var),
epsilon,
momentum,
transformed_desc.get(),
GetBasePtr(&transformed_y),
GetBasePtr(mean_out),
GetBasePtr(variance_out),
GetBasePtr(saved_mean),
GetBasePtr(saved_variance));
if (need_transpose) {
const int y_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc y_reshaped_desc(
transformed_dim_size, y_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnl::Transpose(ctx,
perm,
transformed_y.dims().size(),
transformed_desc.get(),
GetBasePtr(&transformed_y),
y_reshaped_desc.get(),
GetBasePtr(y));
}
}
};
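// batch_norm backward on MLU: reuses the NCHW -> NHWC transposition of the
// forward kernel and calls FusedBatchNormGrad with is_training = false when
// global statistics (Mean/Variance) are used, and true otherwise.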
template <typename T>
class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
const auto *saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
// SavedVariance has been inverted in the forward operator
const auto *saved_inv_variance =
ctx.Input<phi::DenseTensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
const float epsilon = ctx.Attr<float>("epsilon");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto d_x_tmp =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
scale->dims(), dev_ctx);
auto bias_grad_tmp =
ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
if (d_x == nullptr) {
d_x = &d_x_tmp;
}
if (d_scale == nullptr) {
d_scale = &scale_grad_tmp;
}
if (d_bias == nullptr) {
d_bias = &bias_grad_tmp;
}
const auto &place = ctx.GetPlace();
d_x->mutable_data<T>(place);
d_scale->mutable_data<MPDType>(place);
d_bias->mutable_data<MPDType>(place);
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
phi::DenseTensor transformed_d_y;
phi::DenseTensor transformed_x;
phi::DenseTensor transformed_d_x;
const int transformed_dim_size = 4;
const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
transformed_shape,
ToCnnlDataType<T>(),
CNNL_LAYOUT_NHWC);
MLUCnnlTensorDesc others_input_desc(*scale);
bool need_transpose =
(data_layout == DataLayout::kNCHW && x_dims.size() != 2);
if (need_transpose) {
transformed_d_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_d_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
const int org_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc org_reshaped_desc(
transformed_dim_size, org_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
org_reshaped_desc.get(),
GetBasePtr(d_y),
transformed_desc.get(),
GetBasePtr(&transformed_d_y));
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
org_reshaped_desc.get(),
GetBasePtr(x),
transformed_desc.get(),
GetBasePtr(&transformed_x));
} else {
transformed_d_y = *d_y;
transformed_x = *x;
transformed_d_x = *d_x;
}
if (use_global_stats) {
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_variance = ctx.Input<phi::DenseTensor>("Variance");
MLUCnnl::FusedBatchNormGrad(ctx,
false /*is_training*/,
transformed_desc.get(),
GetBasePtr(&transformed_d_y),
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(running_mean),
GetBasePtr(running_variance),
epsilon,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
GetBasePtr(d_scale),
GetBasePtr(d_bias));
} else {
MLUCnnl::FusedBatchNormGrad(ctx,
true /*is_training*/,
transformed_desc.get(),
GetBasePtr(&transformed_d_y),
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(saved_mean),
GetBasePtr(saved_inv_variance),
epsilon,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
GetBasePtr(d_scale),
GetBasePtr(d_bias));
}
if (need_transpose) {
const int d_x_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc d_x_reshaped_desc(
transformed_dim_size, d_x_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
d_x_reshaped_desc.get(),
GetBasePtr(d_x));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(batch_norm,
ops::MLUBatchNormOpKernel<float>,
ops::MLUBatchNormOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(batch_norm_grad,
ops::MLUBatchNormGradOpKernel<float>,
ops::MLUBatchNormGradOpKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
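// bce_loss forward/backward on MLU: thin wrappers around cnnl BceLoss and
// BceLossBackward with reduction mode CNNL_BCE_LOSS_NONE.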
template <typename T>
class BCELossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*labels);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::BceLoss(ctx,
CNNL_BCE_LOSS_NONE,
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(labels),
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class BCELossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*labels);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::BceLossBackward(ctx,
CNNL_BCE_LOSS_NONE,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(labels),
nullptr,
nullptr,
x_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(bce_loss,
ops::BCELossMLUKernel<float>,
ops::BCELossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(bce_loss_grad,
ops::BCELossGradMLUKernel<float>,
ops::BCELossGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace paddle {
namespace operators {
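// cast on MLU: when in_dtype equals out_dtype the kernel degenerates to a
// TensorCopy; otherwise it checks MLUSupportsCast and runs a cnnl Cast.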
template <typename T>
class CastMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
auto place = ctx.GetPlace();
if (src_type == dst_type) {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
output->mutable_data<T>(place);
framework::TensorCopy(*input, place, dev_ctx, output);
return;
}
PADDLE_ENFORCE_EQ(MLUSupportsCast(src_type, dst_type),
true,
platform::errors::InvalidArgument(
"MLU not support cast [%d] to [%d]",
framework::DataTypeToString(src_type),
framework::DataTypeToString(dst_type)));
output->mutable_data(place, framework::TransToPhiDataType(dst_type));
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
cnnlCastDataType_t cast_type = GetCastDataType(src_type, dst_type);
MLUCnnl::Cast(ctx,
cast_type,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(cast,
ops::CastMLUKernel<float>,
ops::CastMLUKernel<int>,
ops::CastMLUKernel<int16_t>,
ops::CastMLUKernel<uint8_t>,
ops::CastMLUKernel<bool>,
ops::CastMLUKernel<int64_t>,
ops::CastMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
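// clip forward on MLU: min/max come from the attributes unless the optional
// Min/Max input tensors are present; tensor values living on the MLU are
// copied to the CPU before being read.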
template <typename T>
class ClipMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto min = static_cast<T>(ctx.Attr<float>("min"));
auto max = static_cast<T>(ctx.Attr<float>("max"));
if (ctx.HasInput("Min")) {
phi::DenseTensor min_cpu;
auto* min_tensor = ctx.Input<phi::DenseTensor>("Min");
auto* min_data = min_tensor->data<T>();
if (platform::is_mlu_place(min_tensor->place())) {
paddle::framework::TensorCopySync(
*min_tensor, platform::CPUPlace(), &min_cpu);
min_data = min_cpu.data<T>();
}
min = min_data[0];
}
if (ctx.HasInput("Max")) {
phi::DenseTensor max_cpu;
auto* max_tensor = ctx.Input<phi::DenseTensor>("Max");
auto* max_data = max_tensor->data<T>();
if (platform::is_mlu_place(max_tensor->place())) {
paddle::framework::TensorCopySync(
*max_tensor, platform::CPUPlace(), &max_cpu);
max_data = max_cpu.data<T>();
}
max = max_data[0];
}
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Clip(ctx,
x_desc.get(),
GetBasePtr(x),
static_cast<const void*>(&min),
static_cast<const void*>(&max),
GetBasePtr(out));
}
};
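// clip backward on MLU: delegated to cnnl HardtanhBackward with
// [min_val, max_val] as the clipping bounds; the optional Min/Max input
// tensors override the attributes.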
template <typename T>
class ClipGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto* min_tensor =
ctx.HasInput("Min") ? ctx.Input<phi::DenseTensor>("Min") : nullptr;
auto* max_tensor =
ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
auto min_val = ctx.Attr<float>("min");
if (min_tensor) {
phi::DenseTensor min_data;
framework::TensorCopy(
*min_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&min_data);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
min_val = static_cast<float>(min_data.data<T>()[0]);
}
auto max_val = ctx.Attr<float>("max");
if (max_tensor) {
phi::DenseTensor max_data;
framework::TensorCopy(
*max_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&max_data);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
max_val = static_cast<float>(max_data.data<T>()[0]);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::HardtanhBackward(ctx,
x_desc.get(),
GetBasePtr(x),
dout_desc.get(),
GetBasePtr(dout),
max_val,
min_val,
dx_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(clip,
ops::ClipMLUKernel<float>,
ops::ClipMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(clip_grad,
ops::ClipGradMLUKernel<float>,
ops::ClipGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
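// concat on MLU: the axis may come from the optional AxisTensor input, in
// which case the output shape is recomputed before calling cnnl Concat.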
template <typename T>
class ConcatMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
auto axis = ctx.Attr<int>("axis");
auto ins_size = ins.size();
bool need_resize_out_dims = false;
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor<int>(axis_tensor)[0];
need_resize_out_dims = true;
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
if (need_resize_out_dims) {
const size_t n = ins.size();
std::vector<framework::DDim> ins_dims(n);
for (size_t i = 0; i < n; i++) {
ins_dims[i] = ins[i]->dims();
}
framework::DDim out_dims =
phi::funcs::ComputeAndCheckShape(true, ins_dims, axis);
out->Resize(out_dims);
}
const int axis_t = axis;
const int ins_size_t = ins_size;
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
// prepare the input descriptors and data pointers for cnnl Concat
std::vector<const void*> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
*ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(GetBasePtr(ins[i]));
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
// launch the cnnl concat
MLUCnnl::Concat(ctx,
ins_size_t,
axis_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(out));
}
};
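// concat backward on MLU: implemented as a cnnl Split of dout along `axis`;
// outputs that are absent or empty get a temporary buffer so every split
// slice still has a destination.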
template <typename T>
class ConcatGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
auto outs = ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("X"));
auto axis = ctx.Attr<int>("axis");
int split_num = ins.size();
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor<int>(axis_tensor)[0];
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
PADDLE_ENFORCE_GE(axis,
0,
platform::errors::InvalidArgument(
"concat_grad: axis should be larger than or "
"equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(
axis,
out_grad->dims().size(),
platform::errors::InvalidArgument(
"concat_grad: axis should be less than ins[0]->dims()!"
"But received axis is %d, while ins[0]->dims()"
"size is %d.",
axis,
out_grad->dims().size()));
// collect output tensors whose names are not kEmptyVarName
std::vector<void*> outputs_vec;
std::vector<phi::DenseTensor> tmp_outputs_vec;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> descs_vec;
for (size_t j = 0; j < outs.size(); ++j) {
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j]));
outputs_vec.push_back(GetBasePtr(outs[j]));
} else {
phi::DenseTensor tmp_tensor;
tmp_tensor.mutable_data<T>(ins[j]->dims(), ctx.GetPlace());
tmp_outputs_vec.push_back(tmp_tensor);
output_descs.emplace_back(MLUCnnlTensorDesc(*ins[j]));
outputs_vec.push_back(GetBasePtr(&(tmp_outputs_vec.back())));
}
descs_vec.push_back(output_descs.back().get());
}
MLUCnnlTensorDesc out_grad_desc(*out_grad);
MLUCnnl::Split(ctx,
static_cast<int>(split_num),
static_cast<int>(axis),
out_grad_desc.get(),
GetBasePtr(out_grad),
descs_vec.data(),
outputs_vec.data());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(concat,
ops::ConcatMLUKernel<float>,
ops::ConcatMLUKernel<paddle::platform::float16>,
ops::ConcatMLUKernel<int64_t>,
ops::ConcatMLUKernel<bool>,
ops::ConcatMLUKernel<int>,
ops::ConcatMLUKernel<uint8_t>);
REGISTER_OP_MLU_KERNEL(concat_grad,
ops::ConcatGradMLUKernel<float>,
ops::ConcatGradMLUKernel<paddle::platform::float16>,
ops::ConcatGradMLUKernel<int64_t>,
ops::ConcatGradMLUKernel<bool>,
ops::ConcatGradMLUKernel<int>,
ops::ConcatGradMLUKernel<uint8_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
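// conv2d_transpose forward on MLU: expressed as cnnl ConvBackpropInput (the
// data gradient of a regular convolution). The kernel works in NHWC, so NCHW
// tensors are transposed in and back out around the call.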
template <typename T>
class Conv2DTransposeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> output_padding =
ctx.Attr<std::vector<int>>("output_padding");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string data_format = ctx.Attr<std::string>("data_format");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
// check dimension
const bool channel_last = data_format == "NHWC";
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_tensor(output->type());
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
auto output_dims = output->dims();
output_tensor.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
}
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
// construct MLU attr
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_desc(
output_tensor, data_layout, ToCnnlDataType(output_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
MLUCnnl::ConvBackpropInput(ctx,
conv_desc.get(),
filter_desc.get(),
GetBasePtr(&trans_filter),
input_desc.get(),
GetBasePtr(&input_tensor),
output_desc.get(),
GetBasePtr(&output_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&output_tensor,
output,
false /*need_reshape_or_alloc*/);
}
}
};
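// conv2d_transpose backward on MLU: the filter gradient comes from cnnl
// ConvBackpropFilter and the input gradient from a plain ConvolutionForward,
// mirroring how a transposed convolution swaps the forward/backward roles.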
template <typename T>
class Conv2DTransposeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
phi::DenseTensor* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
phi::DenseTensor* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const int groups = ctx.Attr<int>("groups");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const phi::DataLayout data_layout = phi::StringToDataLayout(data_format);
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
const bool channel_last = (data_layout == phi::DataLayout::kNHWC);
framework::DDim in_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_grad_tensor(output_grad->type());
output_grad_tensor.set_layout(DataLayout::kNHWC);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&output_grad_tensor,
true /*need_reshape_or_alloc*/);
}
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
// MLU descs
cnnlTensorLayout_t data_layout_mlu = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout_mlu, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc trans_filter_desc(
trans_filter, data_layout_mlu, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_grad_desc(
output_grad_tensor,
data_layout_mlu,
ToCnnlDataType(output_grad_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor filter_grad_tensor(filter_grad->type());
// filter_grad always MCHW
// filter_grad_tensor always MHWC
auto filter_grad_dims = filter_grad->dims();
filter_grad_tensor.mutable_data<T>({filter_grad_dims[0],
filter_grad_dims[2],
filter_grad_dims[3],
filter_grad_dims[1]},
ctx.GetPlace());
filter_grad_tensor.set_layout(DataLayout::kNHWC);
MLUCnnlTensorDesc filter_grad_desc(
filter_grad_tensor,
data_layout_mlu,
ToCnnlDataType(filter_grad_tensor.dtype()));
MLUCnnl::ConvBackpropFilter(ctx,
conv_desc.get(),
output_grad_desc.get(),
GetBasePtr(output_grad),
input_desc.get(),
GetBasePtr(&input_tensor),
filter_grad_desc.get(),
GetBasePtr(&filter_grad_tensor));
// transpose output from MHWC to MCHW
const std::vector<int> perm_to_mchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_mchw,
&filter_grad_tensor,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor(input_grad->type());
input_tensor.set_layout(DataLayout::kNHWC);
if (channel_last) {
input_grad_tensor.ShareDataWith(*input_grad);
} else {
auto input_grad_dims = input_grad->dims();
input_grad_tensor.mutable_data<T>({input_grad_dims[0],
input_grad_dims[2],
input_grad_dims[3],
input_grad_dims[1]},
ctx.GetPlace());
}
MLUCnnlTensorDesc input_grad_desc(
input_grad_tensor,
data_layout_mlu,
ToCnnlDataType(input_grad_tensor.dtype()));
MLUCnnl::ConvolutionForward(ctx,
conv_desc.get(),
nullptr /*alpha*/,
nullptr /*beta*/,
nullptr /*bias_desc*/,
nullptr /*bias_ptr*/,
output_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
trans_filter_desc.get(),
GetBasePtr(&trans_filter),
input_grad_desc.get(),
GetBasePtr(&input_grad_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&input_grad_tensor,
input_grad,
false /*need_reshape_or_alloc*/);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(conv2d_transpose,
ops::Conv2DTransposeMLUKernel<float>,
ops::Conv2DTransposeMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(conv2d_transpose_grad,
ops::Conv2DTransposeGradMLUKernel<float>,
ops::Conv2DTransposeGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
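// cumsum on MLU: with flatten = true the input is viewed as a 1-D tensor and
// axis must stay at its default -1; a negative axis is normalized before the
// cnnl Cumsum call.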
template <typename T>
class CumSumMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int axis = ctx.Attr<int>("axis");
bool exclusive = ctx.Attr<bool>("exclusive");
bool reverse = ctx.Attr<bool>("reverse");
bool flatten = ctx.Attr<bool>("flatten");
out->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor* input_ptr = const_cast<phi::DenseTensor*>(x);
phi::DenseTensor flat_x(x->type());
if (flatten) {
PADDLE_ENFORCE_EQ(
axis,
-1,
platform::errors::InvalidArgument(
"when flatten is true, attr axis must be default %d, but got %d",
-1,
axis));
flat_x.ShareDataWith(*x);
flat_x.Resize(phi::make_ddim({x->numel()}));
input_ptr = &flat_x;
}
const int true_axis = (axis < 0) ? input_ptr->dims().size() + axis : axis;
MLUCnnlTensorDesc input_desc(*input_ptr);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Cumsum(ctx,
true_axis,
exclusive,
reverse,
input_desc.get(),
GetBasePtr(input_ptr),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(cumsum,
ops::CumSumMLUKernel<int>,
ops::CumSumMLUKernel<float>,
ops::CumSumMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
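// deformable_conv on MLU: input, offset, mask and filter are transposed to
// NHWC before cnnl DCNForward; groups > 1 is rejected until cnnl supports it.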
template <typename T>
class DeformableConvMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* offset = ctx.Input<phi::DenseTensor>("Offset");
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
auto* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const int groups = ctx.Attr<int>("groups");
const int deformable_groups = ctx.Attr<int>("deformable_groups");
const int im2col_step = ctx.Attr<int>("im2col_step");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
// TODO(fwg): Remove this check once cnnl fixes the bug for groups > 1.
PADDLE_ENFORCE_EQ(
groups == 1,
true,
platform::errors::InvalidArgument(
"MLU deformable_conv kernel only support groups == 1, but get %d.",
groups));
// transform paddings from {h, w} to {top, bottom, left, right}.
const std::vector<int> trans_paddings{
paddings[0], paddings[0], paddings[1], paddings[1]};
MLUCnnlDCNDesc dcn_desc(input->dims().size(),
trans_paddings.data(),
strides.data(),
dilations.data(),
deformable_groups,
groups,
im2col_step);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
phi::DenseTensor trans_input(input->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_offset(offset->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
offset,
&trans_offset,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_mask(mask->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_filter(filter->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
phi::DenseTensor tmp_output(output->dtype());
auto output_dims = output->dims();
tmp_output.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
trans_input, data_layout, ToCnnlDataType(trans_input.dtype()));
MLUCnnlTensorDesc offset_desc(
trans_offset, data_layout, ToCnnlDataType(trans_offset.dtype()));
MLUCnnlTensorDesc mask_desc(
trans_mask, data_layout, ToCnnlDataType(trans_mask.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.dtype()));
MLUCnnlTensorDesc output_desc(
tmp_output, data_layout, ToCnnlDataType(tmp_output.dtype()));
MLUCnnl::DCNForward(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
filter_desc.get(),
GetBasePtr(&trans_filter),
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(&tmp_output));
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_output,
output,
false /*need_reshape_or_alloc*/);
}
};
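// deformable_conv backward on MLU: DCNBackwardData produces the input,
// offset and mask gradients, DCNBackwardWeight produces the filter gradient,
// and each requested gradient is transposed back from NHWC to NCHW.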
template <typename T>
class DeformableConvGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
auto* offset_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Offset"));
auto* mask_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Mask"));
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
auto* offset = ctx.Input<phi::DenseTensor>("Offset");
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
int groups = ctx.Attr<int>("groups");
int deformable_groups = ctx.Attr<int>("deformable_groups");
int im2col_step = ctx.Attr<int>("im2col_step");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
// TODO(fwg): Remove this check once cnnl fixes the bug for groups > 1.
PADDLE_ENFORCE_EQ(groups == 1,
true,
platform::errors::InvalidArgument(
"MLU deformable_conv_grad kernel only support groups "
"== 1, but get %d.",
groups));
// transform paddings from {h, w} to {top, bottom, left, right}.
const std::vector<int> trans_paddings{
paddings[0], paddings[0], paddings[1], paddings[1]};
MLUCnnlDCNDesc dcn_desc(input->dims().size(),
trans_paddings.data(),
strides.data(),
dilations.data(),
deformable_groups,
groups,
im2col_step);
phi::DenseTensor tmp_input_grad;
auto input_dims = input->dims();
tmp_input_grad.mutable_data<T>(
{input_dims[0], input_dims[2], input_dims[3], input_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_filter_grad;
auto filter_dims = filter->dims();
tmp_filter_grad.mutable_data<T>(
{filter_dims[0], filter_dims[2], filter_dims[3], filter_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_offset_grad;
auto offset_dims = offset->dims();
tmp_offset_grad.mutable_data<T>(
{offset_dims[0], offset_dims[2], offset_dims[3], offset_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_mask_grad;
auto mask_dims = mask->dims();
tmp_mask_grad.mutable_data<T>(
{mask_dims[0], mask_dims[2], mask_dims[3], mask_dims[1]},
ctx.GetPlace());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
phi::DenseTensor trans_output_grad(output_grad->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&trans_output_grad,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_input(input->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_offset(offset->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
offset,
&trans_offset,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_mask(mask->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_filter(filter->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc output_grad_desc(
trans_output_grad,
data_layout,
ToCnnlDataType(trans_output_grad.dtype()));
MLUCnnlTensorDesc input_desc(
trans_input, data_layout, ToCnnlDataType(trans_input.dtype()));
MLUCnnlTensorDesc offset_desc(
trans_offset, data_layout, ToCnnlDataType(trans_offset.dtype()));
MLUCnnlTensorDesc mask_desc(
trans_mask, data_layout, ToCnnlDataType(trans_mask.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.dtype()));
MLUCnnl::DCNBackwardData(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
filter_desc.get(),
GetBasePtr(&trans_filter),
output_grad_desc.get(),
GetBasePtr(&trans_output_grad),
input_desc.get(),
GetBasePtr(&tmp_input_grad),
offset_desc.get(),
GetBasePtr(&tmp_offset_grad),
mask_desc.get(),
GetBasePtr(&tmp_mask_grad));
MLUCnnl::DCNBackwardWeight(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
output_grad_desc.get(),
GetBasePtr(&trans_output_grad),
filter_desc.get(),
GetBasePtr(&tmp_filter_grad),
nullptr,
nullptr);
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_input_grad,
input_grad,
false /*need_reshape_or_alloc*/);
}
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_filter_grad,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (offset_grad) {
offset_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_offset_grad,
offset_grad,
false /*need_reshape_or_alloc*/);
}
if (mask_grad) {
mask_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_mask_grad,
mask_grad,
false /*need_reshape_or_alloc*/);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(deformable_conv, ops::DeformableConvMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(deformable_conv_grad,
ops::DeformableConvGradMLUKernel<float>);
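// Illustrative sketch (not part of the original file): the grad kernel above
// runs every cnnl DCN call in NHWC layout and converts the results back to
// NCHW with the permutation {0, 3, 1, 2}. The standalone helper below shows
// the same index permutation on a flat host buffer using only the standard
// library; the function name and float element type are assumptions made for
// this example.
#include <array>
#include <vector>

// Permute a contiguous 4-D tensor `src` with extents `dims` so that output
// axis i takes its extent and coordinate from input axis perm[i]
// (e.g. perm = {0, 2, 3, 1} maps NCHW to NHWC).
inline std::vector<float> Permute4D(const std::vector<float>& src,
                                    const std::array<int, 4>& dims,
                                    const std::array<int, 4>& perm) {
  std::array<int, 4> out_dims;
  for (int i = 0; i < 4; ++i) out_dims[i] = dims[perm[i]];
  const std::array<int, 4> in_strides = {dims[1] * dims[2] * dims[3],
                                         dims[2] * dims[3], dims[3], 1};
  const std::array<int, 4> out_strides = {
      out_dims[1] * out_dims[2] * out_dims[3], out_dims[2] * out_dims[3],
      out_dims[3], 1};
  std::vector<float> dst(src.size());
  std::array<int, 4> idx;  // coordinate in the *input* layout
  for (idx[0] = 0; idx[0] < dims[0]; ++idx[0])
    for (idx[1] = 0; idx[1] < dims[1]; ++idx[1])
      for (idx[2] = 0; idx[2] < dims[2]; ++idx[2])
        for (idx[3] = 0; idx[3] < dims[3]; ++idx[3]) {
          int src_off = 0, dst_off = 0;
          for (int i = 0; i < 4; ++i) {
            src_off += idx[i] * in_strides[i];
            dst_off += idx[perm[i]] * out_strides[i];
          }
          dst[dst_off] = src[src_off];
        }
  return dst;
}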
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class DropoutMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto is_test = ctx.Attr<bool>("is_test");
auto* seed_tensor =
ctx.HasInput("Seed") ? ctx.Input<phi::DenseTensor>("Seed") : nullptr;
auto dropout_implementation =
ctx.Attr<std::string>("dropout_implementation");
const bool is_upscale = (dropout_implementation == "upscale_in_train");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
if (is_test && is_upscale) {
// dropout op for inference: out = input.
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
out);
return;
} else if (!is_test) {
// dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or
// out = input * mask.
int seed_data = 0;
if (seed_tensor) {
if (platform::is_mlu_place(seed_tensor->place())) {
memory::Copy(platform::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
sizeof(int));
} else {
seed_data = *(seed_tensor->data<int>());
}
} else {
seed_data = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
}
auto* mask = ctx.Output<phi::DenseTensor>("Mask");
mask->mutable_data<uint8_t>(ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
// Special case when dropout_prob is 1.0
if (dropout_prob == 1.0f) {
auto value_t = static_cast<T>(0.0f);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
out_desc.get(),
GetBasePtr(out));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
mask_desc.get(),
GetBasePtr(mask));
return;
}
// create mlu random generator
const int device_id = ctx.GetPlace().GetDeviceId();
auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);
// compute out = input * mask / ( 1.0 - dropout_prob )
MLUCnnl::FusedDropout(ctx,
mlu_gen_random->get(),
x_desc.get(),
GetBasePtr(x),
dropout_prob,
GetBasePtr(&(mlu_gen_random->get_state())),
mask_desc.get(),
GetBasePtr(mask),
out_desc.get(),
GetBasePtr(out));
if (is_upscale) {
return;
}
}
// In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
phi::DenseTensor scale_tensor(x->dtype());
phi::DenseTensor bias_tensor(x->dtype());
scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(
ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);
MLUCnnl::Scale(ctx,
0,
is_test ? x_desc.get() : out_desc.get(),
is_test ? GetBasePtr(x) : GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class DropoutGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(!ctx.Attr<bool>("is_test"),
true,
platform::errors::InvalidArgument(
"GradOp is only callable when is_test is false"));
auto* grad_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* grad_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto dropout_impl = ctx.Attr<std::string>("dropout_implementation");
grad_x->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc grad_x_desc(*grad_x);
if (dropout_prob == 1.) {
auto value_t = static_cast<T>(0.0f);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
grad_x_desc.get(),
GetBasePtr(grad_x));
return;
}
// cast mask from uint8 to float32/float16
phi::DenseTensor cast_mask(grad_x->dtype());
cast_mask.Resize(mask->dims());
cast_mask.mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
MLUCnnlTensorDesc cast_mask_desc(cast_mask);
cnnlCastDataType_t cast_type =
GetCastDataType(framework::TransToProtoVarType(mask->dtype()),
framework::TransToProtoVarType(cast_mask.dtype()));
MLUCnnl::Cast(ctx,
cast_type,
mask_desc.get(),
GetBasePtr(mask),
cast_mask_desc.get(),
GetBasePtr(&cast_mask));
const bool is_upscale = (dropout_impl == "upscale_in_train");
const float scale = is_upscale ? (1.0f / (1.0f - dropout_prob)) : (1.0f);
auto data_type = ToCnnlDataType<T>();
MLUCnnlTensorDesc grad_out_desc(*grad_out);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
cast_mask_desc.get(),
GetBasePtr(&cast_mask),
grad_out_desc.get(),
GetBasePtr(grad_out),
grad_x_desc.get(),
GetBasePtr(grad_x),
data_type,
scale);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(dropout,
ops::DropoutMLUKernel<float>,
ops::DropoutMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(dropout_grad,
ops::DropoutGradMLUKernel<float>,
ops::DropoutGradMLUKernel<plat::float16>);
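// Illustrative sketch (not part of the original file): the two
// dropout_implementation modes handled above differ only in where the
// (1 - dropout_prob) factor lands. The host-side reference below spells out
// both modes for an already-sampled mask; the function name is an assumption
// made for this example.
#include <cstdint>
#include <vector>

inline std::vector<float> DropoutReference(const std::vector<float>& x,
                                           const std::vector<uint8_t>& mask,
                                           float dropout_prob,
                                           bool upscale_in_train,
                                           bool is_test) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    if (is_test) {
      // Inference: upscale_in_train is a pass-through, downgrade_in_infer
      // scales by the keep probability (the Scale call at the end above).
      out[i] = upscale_in_train ? x[i] : x[i] * (1.0f - dropout_prob);
    } else {
      float kept = mask[i] ? x[i] : 0.0f;
      // Training: upscale_in_train rescales kept values by 1 / (1 - p) so the
      // expectation matches inference; downgrade_in_infer leaves them as-is.
      out[i] = upscale_in_train ? kept / (1.0f - dropout_prob) : kept;
    }
  }
  return out;
}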
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/expand_as_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ExpandAsV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto target_rank = target_shape.size();
PADDLE_ENFORCE_GE(target_rank,
rank,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be greater than or equal to "
"the rank (%d) of the input 'x'.",
target_rank,
rank));
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument("The rank (%d) of the input 'x' for "
"expand_as_v2 op must be positive.",
rank));
PADDLE_ENFORCE_LE(target_rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be less than or equal to %d.",
target_rank,
MAX_RANK_SUPPORTED));
ExpandAs(context);
}
protected:
void ExpandAs(const framework::ExecutionContext& context) const {
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = target_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(target_shape[i],
0,
platform::errors::InvalidArgument(
"The value of target shape cannot be zero."));
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
target_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in "
"target tensor for expand_as_v2 op.",
vec_in_dims[i],
target_shape[i]));
}
}
auto* out0 = context.Output<phi::DenseTensor>("Out");
framework::DDim out_dims = phi::make_ddim(target_shape);
out0->Resize(out_dims);
out0->mutable_data<T>(context.GetPlace());
MLUCnnlTensorDesc x_desc(*in0);
MLUCnnlTensorDesc out_desc(*out0);
MLUCnnl::BroadcastTo(context,
x_desc.get(),
GetBasePtr(in0),
out_desc.get(),
GetBasePtr(out0));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_as_v2,
ops::ExpandAsV2MLUKernel<float>,
ops::ExpandAsV2MLUKernel<int>,
ops::ExpandAsV2MLUKernel<int64_t>,
ops::ExpandAsV2MLUKernel<int8_t>,
ops::ExpandAsV2MLUKernel<uint8_t>,
ops::ExpandAsV2MLUKernel<bool>,
ops::ExpandAsV2MLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ExpandV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
auto in_dims = X->dims();
auto expand_shape = get_expand_shape(ctx);
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = expand_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
std::vector<int> final_expand_shape(vec_in_dims.size());
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size cannot be zero."));
if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] -->
// final_expand_shape = [3,4,10,2]
PADDLE_ENFORCE_GT(
expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size (%d) for non-existing dimensions must be "
"positive for expand_v2 op.",
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X =
// [10,1] --> final_expand_shape =
// [3,4,10,4]
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
expand_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in shape for expand_v2 op.",
vec_in_dims[i],
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else {
final_expand_shape[i] = expand_shape[i];
}
} else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape
// = [3,4,10,2]
PADDLE_ENFORCE_EQ(
expand_shape[i],
-1,
platform::errors::InvalidArgument(
"When the value in shape is negative for expand_v2 op, "
"only -1 is supported, but the value received is %d.",
expand_shape[i]));
final_expand_shape[i] = vec_in_dims[i];
}
}
auto rank = X->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand_v2_mlu op must be positive, "
"but the value received is %d.",
rank));
auto shape_size = final_expand_shape.size();
PADDLE_ENFORCE_GE(
shape_size,
rank,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for expand_v2_mlu op must "
"be "
"greater than or equal to the rank (%d) of the input 'X'.",
shape_size,
rank));
framework::DDim out_dims = phi::make_ddim(final_expand_shape);
Out->Resize(out_dims);
auto place = ctx.GetPlace();
Out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*X);
MLUCnnlTensorDesc out_desc(*Out);
MLUCnnl::BroadcastTo(
ctx, x_desc.get(), GetBasePtr(X), out_desc.get(), GetBasePtr(Out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_v2,
ops::ExpandV2MLUKernel<float>,
ops::ExpandV2MLUKernel<paddle::platform::float16>,
ops::ExpandV2MLUKernel<bool>,
ops::ExpandV2MLUKernel<int>,
ops::ExpandV2MLUKernel<int64_t>);
#endif
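// Illustrative sketch (not part of the original file): a host-side version of
// the shape resolution done in ExpandV2MLUKernel above, where the input is
// right-aligned against `expand_shape`, -1 means "keep the input extent", and
// prepended dimensions must be given explicitly. The function name and the
// use of exceptions instead of PADDLE_ENFORCE are assumptions for this sketch.
#include <stdexcept>
#include <vector>

inline std::vector<int> ResolveExpandShape(
    std::vector<int> in_dims, const std::vector<int>& expand_shape) {
  const size_t diff = expand_shape.size() - in_dims.size();
  in_dims.insert(in_dims.begin(), diff, 1);  // e.g. [10, 2] -> [1, 1, 10, 2]
  std::vector<int> out(in_dims.size());
  for (size_t i = 0; i < in_dims.size(); ++i) {
    if (expand_shape[i] > 0) {
      if (in_dims[i] != 1 && in_dims[i] != expand_shape[i])
        throw std::invalid_argument("non-singleton dimension mismatch");
      out[i] = expand_shape[i];
    } else if (expand_shape[i] == -1 && i >= diff) {
      out[i] = in_dims[i];  // keep the original extent
    } else {
      throw std::invalid_argument("expanded size must be positive or -1");
    }
  }
  return out;  // expand_shape = {3, 4, -1, -1}, X = {10, 2} -> {3, 4, 10, 2}
}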
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class FillAnyLikeMLUKernel : public framework::OpKernel<T> {
public:
using CommonType = typename std::common_type<
float,
typename std::conditional<std::is_same<T, platform::float16>::value,
float,
T>::type>::type;
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
float value = ctx.Attr<float>("value");
auto common_type_value = static_cast<CommonType>(value);
PADDLE_ENFORCE_EQ(
(common_type_value >=
static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
(common_type_value <=
static_cast<CommonType>(std::numeric_limits<T>::max())),
true,
platform::errors::InvalidArgument(
"The filled value is out of range for target type, "
"current kernel type is %s, the range should between %f "
"and %f, but now value is %f.",
typeid(T).name(),
static_cast<CommonType>(std::numeric_limits<T>::lowest()),
static_cast<CommonType>(std::numeric_limits<T>::max()),
value));
PADDLE_ENFORCE_EQ(
std::isnan(value),
false,
platform::errors::InvalidArgument("The filled value is NaN."));
auto value_t = static_cast<T>(value);
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(), GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(fill_any_like,
ops::FillAnyLikeMLUKernel<int>,
ops::FillAnyLikeMLUKernel<int64_t>,
ops::FillAnyLikeMLUKernel<float>,
ops::FillAnyLikeMLUKernel<plat::float16>);
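// Illustrative sketch (not part of the original file): the guard in
// FillAnyLikeMLUKernel promotes the fill value and the target type's limits to
// a common type before comparing (float16 goes through float). A standalone
// equivalent using only the standard library; the function name is an
// assumption for this sketch.
#include <cmath>
#include <limits>
#include <stdexcept>
#include <type_traits>

template <typename T>
T CheckedFillValue(float value) {
  using Common = typename std::common_type<float, T>::type;
  const auto v = static_cast<Common>(value);
  if (v < static_cast<Common>(std::numeric_limits<T>::lowest()) ||
      v > static_cast<Common>(std::numeric_limits<T>::max()))
    throw std::out_of_range("filled value is out of range for the target type");
  if (std::isnan(value)) throw std::invalid_argument("filled value is NaN");
  return static_cast<T>(value);
}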
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantBatchSizeLikeOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto float_value = ctx.Attr<float>("value");
auto str_value = ctx.Attr<std::string>("str_value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto *in = ctx.Input<phi::DenseTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the phi::DenseTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
if (cpu_place) {
auto &dev_ctx = *pool.Get(platform::CPUPlace());
phi::funcs::SetConstant<phi::CPUContext, T> functor;
out->mutable_data(platform::CPUPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const phi::CPUContext &>(dev_ctx),
out,
static_cast<T>(value));
} else {
out->mutable_data(ctx.GetPlace(),
framework::TransToPhiDataType(data_type));
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
MLUCnnlTensorDesc output_desc(*out);
MLUCnnl::Fill(
ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpMLUKernel<int>,
ops::FillConstantBatchSizeLikeOpMLUKernel<float>,
ops::FillConstantBatchSizeLikeOpMLUKernel<plat::float16>);
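// Illustrative sketch (not part of the original file): the fill kernels in
// this diff parse the "str_value" attribute the same way, special-casing
// "inf", "-inf" and "nan" because those cannot be read back through a
// stringstream, and routing integral targets through int64_t instead of
// going via double. The function name is an assumption for this sketch.
#include <cstdint>
#include <limits>
#include <sstream>
#include <string>
#include <type_traits>

template <typename T>
T ParseFillValue(const std::string& str_value, float float_value) {
  if (str_value.empty()) return static_cast<T>(float_value);
  if (str_value == "inf")
    return static_cast<T>(std::numeric_limits<double>::infinity());
  if (str_value == "-inf")
    return static_cast<T>(-std::numeric_limits<double>::infinity());
  if (str_value == "nan")
    return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
  std::stringstream convert_stream(str_value);
  if (std::is_same<int64_t, T>::value) {
    int64_t tmp_value;
    convert_stream >> tmp_value;
    return static_cast<T>(tmp_value);
  }
  double tmp_value;
  convert_stream >> tmp_value;
  return static_cast<T>(tmp_value);
}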
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto str_value = ctx.Attr<std::string>("str_value");
auto float_value = ctx.Attr<float>("value");
auto *out_var = ctx.Output<phi::DenseTensor>("Out");
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
if (ctx.HasInput("ValueTensor")) {
auto *value_tensor = ctx.Input<phi::DenseTensor>("ValueTensor");
PADDLE_ENFORCE_EQ(
value_tensor->numel(),
1,
platform::errors::InvalidArgument(
"When use phi::DenseTensor as value to set phi::DenseTensor "
"value in fill_cosntant, "
"value input(ValueTensor) size must be 1, but get %d",
value_tensor->numel()));
value_data = value_tensor->data<T>();
auto tmp_place = value_tensor->place();
if (platform::is_mlu_place(tmp_place)) {
pointer_mode = CNNL_POINTER_MODE_DEVICE;
}
}
auto shape = GetShape(ctx);
out_var->mutable_data<T>(shape, ctx.GetPlace());
MLUCnnlTensorDesc output_desc(*out_var);
MLUCnnl::Fill(
ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out_var));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
fill_constant,
paddle::operators::FillConstantMLUKernel<float>,
paddle::operators::FillConstantMLUKernel<bool>,
paddle::operators::FillConstantMLUKernel<int>,
paddle::operators::FillConstantMLUKernel<uint8_t>,
paddle::operators::FillConstantMLUKernel<int16_t>,
paddle::operators::FillConstantMLUKernel<int64_t>,
paddle::operators::FillConstantMLUKernel<paddle::platform::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/flatten_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FlattenMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
auto &axes = context.Attr<int>("axis");
auto x_dims = in->dims();
auto out_dims = phi::make_ddim(GetOutputShape(axes, x_dims));
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
static std::vector<int32_t> GetOutputShape(const int axis,
const framework::DDim &in_dims) {
int64_t outer = 1, inner = 1;
for (int i = 0; i < in_dims.size(); ++i) {
if (i < axis) {
outer *= in_dims[i];
} else {
inner *= in_dims[i];
}
}
std::vector<int32_t> out_shape(2);
out_shape[0] = outer;
out_shape[1] = inner;
return out_shape;
}
};
template <typename DeviceContext, typename T>
class FlattenGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto in_dims = ctx.Input<phi::DenseTensor>("X")->dims();
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
d_x);
d_x->Resize(in_dims);
}
};
template <typename DeviceContext, typename T>
class Flatten2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &axes = context.Attr<int>("axis");
auto *in = context.Input<phi::DenseTensor>("X");
auto x_dims = in->dims();
auto *out = context.Output<phi::DenseTensor>("Out");
auto out_dims = phi::make_ddim(
FlattenMLUKernel<DeviceContext, T>::GetOutputShape(axes, x_dims));
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
};
template <typename DeviceContext, typename T>
class Flatten2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
d_x);
d_x->Resize(x_dims);
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
out->mutable_data(context.GetPlace(), in->type());
auto &start_axis = context.Attr<int>("start_axis");
auto &stop_axis = context.Attr<int>("stop_axis");
// make out dims
auto in_dims = in->dims();
auto out_dims =
phi::make_ddim(GetOutputShape(start_axis, stop_axis, in_dims));
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
static std::vector<int32_t> GetOutputShape(const int start_axis,
const int stop_axis,
const framework::DDim &in_dims) {
int64_t outer = 1;
std::vector<int32_t> out_shape;
int in_dims_size = in_dims.size();
out_shape.reserve(in_dims_size - stop_axis + start_axis);
int real_start_axis = start_axis, real_stop_axis = stop_axis;
if (start_axis < 0) {
real_start_axis = start_axis + in_dims_size;
}
if (stop_axis < 0) {
real_stop_axis = stop_axis + in_dims_size;
}
for (int i = 0; i < real_start_axis; ++i) {
out_shape.push_back(in_dims[i]);
}
for (int i = real_start_axis; i <= real_stop_axis; i++) {
if (in_dims[i] == -1 || outer == -1) {
outer = -1;
} else {
outer *= in_dims[i];
}
}
out_shape.push_back(outer);
for (int i = real_stop_axis + 1; i < in_dims_size; i++) {
out_shape.push_back(in_dims[i]);
}
return out_shape;
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<paddle::platform::MLUDeviceContext>(),
d_x);
d_x->Resize(x_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
flatten,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_grad,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten2,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten2_grad,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
float>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
double>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
float>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
double>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int64_t>);
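// Illustrative sketch (not part of the original file): a host-side rerun of
// FlattenContiguousRangeMLUKernel::GetOutputShape, showing how negative
// start/stop axes are normalised and how the covered extents fold into a
// single dimension (with -1 propagated as "unknown"). The function name is an
// assumption for this sketch.
#include <vector>

inline std::vector<int> FlattenRangeShape(const std::vector<int>& in_dims,
                                          int start_axis, int stop_axis) {
  const int rank = static_cast<int>(in_dims.size());
  if (start_axis < 0) start_axis += rank;  // e.g. -1 becomes rank - 1
  if (stop_axis < 0) stop_axis += rank;
  std::vector<int> out;
  for (int i = 0; i < start_axis; ++i) out.push_back(in_dims[i]);
  int folded = 1;
  for (int i = start_axis; i <= stop_axis; ++i)
    folded = (in_dims[i] == -1 || folded == -1) ? -1 : folded * in_dims[i];
  out.push_back(folded);
  for (int i = stop_axis + 1; i < rank; ++i) out.push_back(in_dims[i]);
  return out;  // e.g. {2, 3, 4, 5} with [start, stop] = [1, 2] -> {2, 12, 5}
}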
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherNdMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->template mutable_data<T>(place);
if (x->numel() == 0) return;
if (index->numel() == 0) {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*x, place, dev_ctx, out);
return;
}
const auto &index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
platform::errors::InvalidArgument(
"Index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s]",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc index_desc(*index);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::GatherNd(ctx,
x_desc.get(),
GetBasePtr(x),
index_desc.get(),
GetBasePtr(index),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class GatherNdGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *x = ctx.Input<phi::DenseTensor>("X");
if (dx->numel() == 0) return;
if (index->numel() == 0) {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
return;
}
phi::DenseTensor tmp_tensor(index->type());
phi::DenseTensor tmp_tensor2(dout->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {1, index_dims[0]};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
tmp_tensor2.ShareDataWith(*dout);
std::vector<int64_t> new_dim2{1};
for (int i = index->numel(); i < x->dims().size(); i++) {
new_dim2.push_back(x->dims()[i]);
}
tmp_tensor2.Resize(phi::make_ddim(new_dim2));
dout = &tmp_tensor2;
}
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dx_desc(*dx);
auto value = static_cast<T>(0);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));
MLUCnnlTensorDesc index_desc(*index);
MLUCnnlTensorDesc dout_desc(*dout);
const cnnlScatterNdMode_t mode = CNNL_SCATTERND_ADD;
MLUCnnl::ScatterNd(ctx,
mode,
index_desc.get(),
GetBasePtr(index),
dout_desc.get(),
GetBasePtr(dout),
dx_desc.get(),
GetBasePtr(dx),
dx_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather_nd,
ops::GatherNdMLUKernel<float>,
ops::GatherNdMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(gather_nd_grad,
ops::GatherNdGradMLUKernel<paddle::platform::float16>,
ops::GatherNdGradMLUKernel<float>);
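// Illustrative sketch (not part of the original file): the grad kernel above
// scatters `dout` back into a zero-filled `dx` with CNNL_SCATTERND_ADD, i.e.
// the inverse of the forward gather. A minimal host-side reference for the
// forward gather_nd on a 2-D input with an index of shape {num, 2}; the
// function name and pair-based index are assumptions for this sketch.
#include <cassert>
#include <utility>
#include <vector>

inline std::vector<float> GatherNd2D(
    const std::vector<float>& x, int rows, int cols,
    const std::vector<std::pair<int, int>>& index) {
  std::vector<float> out;
  out.reserve(index.size());
  for (const auto& ij : index) {
    assert(ij.first < rows && ij.second < cols);
    out.push_back(x[ij.first * cols + ij.second]);  // out[k] = x[i_k][j_k]
  }
  return out;
}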
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto axis = ctx.Attr<int>("axis");
const auto index_dims = index->dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto *out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
MLUCnnlTensorDesc index_desc(
1, index_shape_1d, ToCnnlDataType(index->dtype()));
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::GatherFunctor(ctx,
axis,
0 /*batch_dims*/,
x_desc.get(),
GetBasePtr(x),
index_desc.get(),
GetBasePtr(index),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class GatherGradOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
const auto index_dims = index->dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dx_desc(*dx);
auto value = static_cast<T>(0);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));
int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
MLUCnnlTensorDesc index_desc(
1, index_shape_1d, ToCnnlDataType(index->dtype()));
MLUCnnlTensorDesc dout_desc(*dout);
const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterRefFunctor(ctx,
dx_desc.get(),
GetBasePtr(dx),
dout_desc.get(),
GetBasePtr(dout),
index_desc.get(),
GetBasePtr(index),
mode);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather,
ops::GatherOpMLUKernel<float>,
ops::GatherOpMLUKernel<paddle::platform::float16>,
ops::GatherOpMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(gather_grad,
ops::GatherGradOpMLUKernel<float>,
ops::GatherGradOpMLUKernel<paddle::platform::float16>,
ops::GatherGradOpMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<phi::DenseTensor>("Out");
tensor->mutable_data<T>(context.GetPlace());
phi::DenseTensor cpu_tensor(tensor->type());
cpu_tensor.Resize(tensor->dims());
T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::normal_distribution<T> dist(mean, std);
int64_t size = tensor->numel();
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = phi::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
cpu_data[i] = dist(*engine);
}
auto& dev_ctx =
context.template device_context<paddle::platform::MLUDeviceContext>();
framework::TensorCopy(cpu_tensor, context.GetPlace(), dev_ctx, tensor);
dev_ctx.Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gaussian_random, ops::MLUGaussianRandomKernel<float>);
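// Illustrative sketch (not part of the original file): the kernel above draws
// every sample on the host with std::normal_distribution and then copies the
// buffer to the device. The host-side part in isolation; the mt19937_64
// engine stands in for phi::GetCPURandomEngine and the function name is an
// assumption for this sketch.
#include <cstdint>
#include <random>
#include <vector>

inline std::vector<float> SampleGaussian(int64_t size, float mean,
                                         float std_dev, unsigned int seed) {
  std::mt19937_64 engine(seed);
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> cpu_data(static_cast<size_t>(size));
  for (auto& v : cpu_data) v = dist(engine);  // fill on CPU, copy to MLU later
  return cpu_data;
}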
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class GridSamplerMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(ctx.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
// input and output data
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("X");
const phi::DenseTensor* grid = ctx.Input<phi::DenseTensor>("Grid");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
int n = input->dims()[0];
int c = input->dims()[1];
int out_h = grid->dims()[1];
int out_w = grid->dims()[2];
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
// attrs
// paddle.nn.functional.grid_sample(x, grid, mode='bilinear',
// padding_mode='zeros', align_corners=True, name=None)
const std::string mode = ctx.Attr<std::string>("mode");
const std::string padding_mode = ctx.Attr<std::string>("padding_mode");
bool align_corners = ctx.Attr<bool>("align_corners");
const std::string data_format = phi::DataLayoutToString(input->layout());
PADDLE_ENFORCE_EQ(
mode == "bilinear",
true,
platform::errors::Unavailable(
"Only support bilinear mode in mlu grid_sample kernel."));
PADDLE_ENFORCE_EQ(
padding_mode == "zeros",
true,
platform::errors::Unavailable(
"Only support zeros padding_mode in mlu grid_sample kernel."));
phi::DenseTensor trans_input(input->dtype());
// transpose input from NCHW to NHWC
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor tmp_output(output->dtype());
tmp_output.mutable_data<T>({n, out_h, out_w, c}, ctx.GetPlace());
MLUCnnlGridSampleDesc grid_sample_desc(mode, padding_mode, align_corners);
MLUCnnlTensorDesc input_desc(
trans_input, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc grid_desc(*grid, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc tmp_output_desc(
tmp_output, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnl::GridSample(ctx,
grid_sample_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
grid_desc.get(),
GetBasePtr(grid),
tmp_output_desc.get(),
GetBasePtr(&tmp_output));
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {
0,
3,
1,
2,
};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_output,
output,
false /*need_reshape_or_alloc*/);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(grid_sampler,
ops::GridSamplerMLUKernel<float>,
ops::GridSamplerMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* residual = ctx.Output<phi::DenseTensor>("Residual");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
// compute y-x
cnnlDataType_t data_type = ToCnnlDataType<T>();
residual->mutable_data<T>(x->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(residual),
data_type);
// compute smoothl1loss
out->mutable_data<T>(x->dims(), place);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossForward(ctx,
x_desc.get(),
GetBasePtr(x),
x_desc.get(), /* target has same shape as x */
GetBasePtr(y),
static_cast<float>(delta),
smoothl1_algo,
x_desc.get(), /* out has same shape as x */
GetBasePtr(out));
// compute multiply by delta
phi::DenseTensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(out->dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Scale(ctx,
axis,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* residual = ctx.Input<phi::DenseTensor>("Residual");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
phi::DenseTensor t_grad_rd;
t_grad_rd =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
if (dx || dy) {
phi::DenseTensor t_zero;
t_zero =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);
MLUCnnlTensorDesc residual_desc(*residual);
MLUCnnlTensorDesc dout_desc(*dout);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossBackward(ctx,
residual_desc.get(),
GetBasePtr(residual),
residual_desc.get(),
GetBasePtr(&t_zero),
dout_desc.get(),
GetBasePtr(dout),
static_cast<float>(delta),
smoothl1_algo,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd));
}
// compute multiply by delta
phi::DenseTensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
if (dx) {
dx->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dx);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dx));
}
if (dy) {
dy->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dy);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
ops::HuberLossMLUKernel<float>,
ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
ops::HuberLossGradMLUKernel<float>,
ops::HuberLossGradMLUKernel<plat::float16>);
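// Illustrative sketch (not part of the original file): the forward kernel
// above composes residual = y - x, an unreduced smooth-L1 with threshold
// delta, and a final scale by delta. Assuming the usual smooth-L1 convention
// for the cnnl call, that composition is the standard Huber loss, spelled out
// per element below; the function name is an assumption for this sketch.
#include <cmath>

inline float HuberLossReference(float x, float y, float delta) {
  const float r = y - x;  // the "Residual" output written by the kernel
  const float abs_r = std::fabs(r);
  // smooth-L1 with beta = delta: quadratic inside the threshold, linear outside
  const float smooth_l1 =
      abs_r < delta ? 0.5f * r * r / delta : abs_r - 0.5f * delta;
  // the trailing Scale-by-delta step yields the Huber loss:
  // 0.5 * r^2 if |r| < delta, otherwise delta * (|r| - 0.5 * delta)
  return delta * smooth_l1;
}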
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class LabelSmoothMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_t = ctx.Input<phi::DenseTensor>("X");
auto* dist_t = ctx.Input<phi::DenseTensor>("PriorDist");
auto* out_t = ctx.Output<phi::DenseTensor>("Out");
auto epsilon = ctx.Attr<float>("epsilon");
auto epsilon_gt = 1.0f - epsilon;
if (in_t->numel() == 0) return;
out_t->mutable_data<T>(ctx.GetPlace());
auto label_dim = in_t->dims()[in_t->dims().size() - 1];
MLUCnnlTensorDesc x_desc(*in_t);
MLUCnnlTensorDesc out_desc(*out_t);
auto data_type = ToCnnlDataType<T>();
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_ADD, data_type, CNNL_NOT_PROPAGATE_NAN);
if (ctx.HasInput("PriorDist")) {
MLUCnnlTensorDesc dist_desc(*dist_t);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(in_t),
dist_desc.get(),
GetBasePtr(dist_t),
out_desc.get(),
GetBasePtr(out_t),
data_type,
epsilon_gt,
epsilon);
} else {
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor dist_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1, label_dim}, dev_ctx);
MLUCnnlTensorDesc dist_desc(dist_tensor);
auto value = static_cast<T>(1.0f / label_dim);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value,
dist_desc.get(),
GetBasePtr(&dist_tensor));
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(in_t),
dist_desc.get(),
GetBasePtr(&dist_tensor),
out_desc.get(),
GetBasePtr(out_t),
data_type,
epsilon_gt,
epsilon);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(label_smooth,
ops::LabelSmoothMLUKernel<float>,
ops::LabelSmoothMLUKernel<plat::float16>);
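// Illustrative sketch (not part of the original file): the OpTensor call above
// is an alpha/beta blend, out = (1 - epsilon) * x + epsilon * dist, where dist
// defaults to a uniform 1 / label_dim when "PriorDist" is absent. A host-side
// reference for the default case; the function name is an assumption for this
// sketch.
#include <vector>

inline std::vector<float> LabelSmoothReference(const std::vector<float>& labels,
                                               float epsilon, int label_dim) {
  std::vector<float> out(labels.size());
  for (size_t i = 0; i < labels.size(); ++i)
    out[i] = (1.0f - epsilon) * labels[i] + epsilon / label_dim;
  return out;
}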
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class LookupTableV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *ids_t = ctx.Input<phi::DenseTensor>("Ids"); // int tensor
auto *output_t = ctx.Output<phi::DenseTensor>("Out"); // float tensor
auto *table_t = ctx.Input<phi::DenseTensor>("W");
int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
auto *table_var = ctx.InputVar("W");
PADDLE_ENFORCE_EQ(
table_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument("mlu only accept phi::DenseTensor"));
output_t->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc ids_desc(*ids_t);
MLUCnnlTensorDesc table_desc(*table_t);
MLUCnnlTensorDesc output_desc(*output_t);
MLUCnnl::EmbeddingForward(ctx,
padding_idx,
table_desc.get(),
GetBasePtr(table_t),
ids_desc.get(),
static_cast<const int *>(GetBasePtr(ids_t)),
output_desc.get(),
GetBasePtr(output_t));
}
};
template <typename T>
class LookupTableV2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *table_var = ctx.InputVar("W");
PADDLE_ENFORCE_EQ(
table_var->IsType<phi::DenseTensor>(),
true,
platform::errors::PermissionDenied(
"Unsupported Variable Type , idx in "
"LookupTableV2GradMLUKernel should be phi::DenseTensor."));
bool is_sparse = ctx.Attr<bool>("is_sparse");
PADDLE_ENFORCE_EQ(
is_sparse,
false,
platform::errors::InvalidArgument(
"LookupTableV2GradMLUKernel dose NOT support is_sparse = True."));
auto *ids_t = ctx.Input<phi::DenseTensor>("Ids");
auto *output_grad_t =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *table_grad_t =
ctx.Output<phi::DenseTensor>(framework::GradVarName("W"));
table_grad_t->mutable_data<T>(ctx.GetPlace());
int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
int64_t ids_numel = ids_t->numel();
PADDLE_ENFORCE_EQ(
ids_numel <= std::numeric_limits<int32_t>::max(),
true,
platform::errors::OutOfRange(
"Number of ids greater than int32_t::max , please check "
"number of ids in LookupTableV2GradMLUKernel."));
phi::DenseTensor ids_int32(ids_t->dtype());
if (ids_t->dtype() != DataType::INT32) {
ids_int32.mutable_data<int>(ids_t->dims(), ctx.GetPlace());
MLUCnnlTensorDesc ids_desc(*ids_t);
MLUCnnlTensorDesc ids_int32_desc(ids_int32);
auto cast_type = GetCastDataType(ids_t->dtype(), DataType::INT32);
MLUCnnl::Cast(ctx,
cast_type,
ids_desc.get(),
GetBasePtr(ids_t),
ids_int32_desc.get(),
GetBasePtr(&ids_int32));
} else {
ids_int32 = *ids_t;
}
MLUCnnlTensorDesc ids_int32_desc(ids_int32);
MLUCnnlTensorDesc output_grad_desc(*output_grad_t);
MLUCnnlTensorDesc table_grad_desc(*table_grad_t);
MLUCnnl::EmbeddingBackward(ctx,
padding_idx,
false,
ids_int32_desc.get(),
GetBasePtr(&ids_int32),
output_grad_desc.get(),
GetBasePtr(output_grad_t),
table_grad_desc.get(),
GetBasePtr(table_grad_t));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(lookup_table_v2,
ops::LookupTableV2MLUKernel<float>,
ops::LookupTableV2MLUKernel<int>,
ops::LookupTableV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad,
ops::LookupTableV2GradMLUKernel<float>,
ops::LookupTableV2GradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MeshgridMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
PADDLE_ENFORCE_EQ((ins.size() > 1) && (ins.size() < 7),
true,
platform::errors::InvalidArgument(
"Excepted phi::DenseTensor numbers between 2 and 6, "
"but only received d% .",
ins.size()));
int64_t size = ins.size();
std::vector<int64_t> shape(size);
for (int64_t i = 0; i < size; i++) {
switch (ins[i]->dims().size()) {
case 0:
shape[i] = 1;
break;
case 1:
shape[i] = ins[i]->dims()[0];
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected scalar or 1D tensor in the tensor list but got tensor "
"%d: ",
i));
}
}
MLUCnnlTensorDesc out_desc(size, shape.data(), ToCnnlDataType<T>());
framework::DDim out_dims = phi::make_ddim(shape);
for (int64_t i = 0; i < size; i++) {
std::vector<int64_t> view_shape(size, 1);
view_shape[i] = shape[i];
outs[i]->Resize(out_dims);
outs[i]->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc in_desc(size, view_shape.data(), ToCnnlDataType<T>());
MLUCnnl::BroadcastTo(ctx,
in_desc.get(),
GetBasePtr(ins[i]),
out_desc.get(),
GetBasePtr(outs[i]));
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(
meshgrid,
paddle::operators::MeshgridMLUKernel<int>,
paddle::operators::MeshgridMLUKernel<float>,
paddle::operators::MeshgridMLUKernel<int64_t>,
paddle::operators::MeshgridMLUKernel<paddle::platform::float16>);
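// Illustrative sketch (not part of the original file): for input i the kernel
// above views the 1-D tensor as shape [1, ..., len_i, ..., 1] and broadcasts
// it to [len_0, ..., len_{n-1}]. A host-side reference for the 2-D case; the
// function name is an assumption for this sketch.
#include <vector>

inline void Meshgrid2D(const std::vector<float>& a, const std::vector<float>& b,
                       std::vector<float>* out_a, std::vector<float>* out_b) {
  const size_t rows = a.size(), cols = b.size();
  out_a->assign(rows * cols, 0.0f);
  out_b->assign(rows * cols, 0.0f);
  for (size_t i = 0; i < rows; ++i)
    for (size_t j = 0; j < cols; ++j) {
      (*out_a)[i * cols + j] = a[i];  // a viewed as {rows, 1}, broadcast over j
      (*out_b)[i * cols + j] = b[j];  // b viewed as {1, cols}, broadcast over i
    }
}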