Unverified commit 194e5a76, authored by sunsetlh, committed by GitHub

[XPU] fix bugs: __xpu__conv2d, activation, elementwise (#4278)

Parent d91fdbb5
@@ -244,6 +244,7 @@ class XPUConv2dBlock0Fuser : public FuseBase {
std::string output_name = "";
if (_with_relu) {
op_desc.SetAttr("act_type", std::string{"relu"});
output_name = matched.at("relu_out")->arg()->name;
} else {
output_name = matched.at("bn_out")->arg()->name;
@@ -433,6 +434,7 @@ class XPUConv2dBlock1Fuser : public FuseBase {
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
scope->NewTensor(max_output_name);
op_desc.SetOutput("OutputMax", {max_output_name});
op_desc.SetAttr("act_type", std::string{"relu"});
auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
auto& valid_places = conv_old->valid_places();
......
@@ -48,8 +48,9 @@ void XPUConv2dCompute::Run() {
std::string filter_type = param.filter_type;
int groups = param.groups;
- int act_type = (param.act_type == -1) ? xdnn::Activation_t::RELU
-                                        : param.act_type;  // -1 means not init
int act_type = (param.act_type == "relu")
                   ? xdnn::Activation_t::RELU
                   : xdnn::Activation_t::LINEAR;  // anything but "relu" means no fused activation
const auto* bias = param.Bias ? param.Bias->data<float>() : nullptr;
const auto* branch = param.Branch ? param.Branch->data<float>() : nullptr;
const float* input_max =
@@ -60,7 +61,6 @@ void XPUConv2dCompute::Run() {
float* output = param.Output->mutable_data<float>(TARGET(kXPU));
// TODO(luohang): only ResNet-50 is supported for now
- CHECK_EQ(act_type, xdnn::Activation_t::RELU);
CHECK_EQ(groups, 1);
CHECK_EQ(filter_type, "int16");
......
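Taken together with the fuser, op, and param changes further down, act_type now travels through the graph as a string attribute instead of a raw int, and the kernel maps it onto an xdnn activation at Run() time. A condensed sketch of that mapping (illustrative only; the enum is a stand-in for xdnn's real constants, and only "relu" is recognized here, mirroring the check above):

#include <string>

// Stand-ins for the two xdnn activation values used above (the real
// constants come from xdnn's headers).
enum Activation { LINEAR = 0, RELU = 1 };

// "relu" selects RELU; anything else, including the default-initialized
// empty string in XPUConv2dParam, falls back to LINEAR (no fused activation).
Activation ToAct(const std::string& act_type) {
  return act_type == "relu" ? RELU : LINEAR;
}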
@@ -73,6 +73,19 @@ void AbsCompute::Run() {
CHECK_EQ(r, 0);
}
void ExpCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::EXP, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void SquareCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
@@ -86,6 +99,19 @@ void SquareCompute::Run() {
CHECK_EQ(r, 0);
}
void ReciprocalCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::RECIPROCAL, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void SqrtCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
@@ -103,11 +129,14 @@ void PowCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
xdnn::Activation_t act_type(xdnn::Activation_t::ACT_POW);
act_type.pow_factor = param.factor;
int r = xdnn::activation_forward(
- ctx.GetRawContext(), /* context */
- xdnn::Activation_t::ACT_POW, /* type */
- param.X->numel(), /* len */
- param.X->data<float>(), /* x */
ctx.GetRawContext(), /* context */
act_type, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
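The PowCompute fix: the old call passed the bare enum xdnn::Activation_t::ACT_POW, so param.factor was never forwarded; constructing an Activation_t and setting pow_factor carries the exponent along with the call. For reference, what ACT_POW is expected to compute (a CPU sketch under the assumption y[i] = x[i]^factor):

#include <cmath>
#include <cstddef>

// CPU reference for the pow activation with exponent `factor`:
// y[i] = pow(x[i], factor).
void PowRef(const float* x, float* y, std::size_t len, float factor) {
  for (std::size_t i = 0; i < len; ++i) y[i] = std::pow(x[i], factor);
}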
@@ -158,6 +187,12 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(
exp, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ExpCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(
square, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SquareCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
@@ -181,3 +216,13 @@ REGISTER_LITE_KERNEL(
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(reciprocal,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ReciprocalCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
@@ -57,6 +56,15 @@ class AbsCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~AbsCompute() = default;
};
class ExpCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~ExpCompute() = default;
};
class SquareCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
@@ -66,6 +74,15 @@ class SquareCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~SquareCompute() = default;
};
class ReciprocalCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~ReciprocalCompute() = default;
};
class SqrtCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
@@ -77,7 +94,7 @@ class SqrtCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
class PowCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
- using param_t = operators::ActivationParam;
using param_t = operators::PowParam;
virtual void Run();
......
@@ -13,8 +13,12 @@
// limitations under the License.
#include "lite/kernels/xpu/elementwise_compute.h"
#include <algorithm>
#include <functional>
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
namespace paddle {
@@ -22,113 +26,300 @@ namespace lite {
namespace kernels {
namespace xpu {
inline DDim TrimTrailingSingularDims(const DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
std::vector<int64_t> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim();
}
DDim actual_dims = DDim(trim_dims);
return actual_dims;
}
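For intuition: only trailing 1s are stripped, leading and interior 1s survive, and an all-ones shape collapses to an empty DDim, which the elementwise kernels below turn into axis = x_dims.size(). A standalone sketch of the same rule, with std::vector<int64_t> standing in for DDim:

#include <cassert>
#include <cstdint>
#include <vector>

// Drop trailing size-1 dims only; [4, 1, 3, 1, 1] -> [4, 1, 3].
std::vector<int64_t> TrimTrailingOnes(std::vector<int64_t> d) {
  while (!d.empty() && d.back() == 1) d.pop_back();
  return d;
}

int main() {
  assert((TrimTrailingOnes({4, 1, 3, 1, 1}) == std::vector<int64_t>{4, 1, 3}));
  assert(TrimTrailingOnes({1, 1}).empty());  // all ones -> empty shape
  return 0;
}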
inline void GetMidDims(const DDim& x_dims,
const DDim& y_dims,
const int axis,
int* pre,
int* n,
int* post,
int* mid_flag = NULL) {
*pre = 1;
*n = 1;
*post = 1;
if (mid_flag != NULL) {
*mid_flag = 0;
int mid = 0;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
if (x_dims[i + axis] != y_dims[i]) {
// only a single y_dims[i] == 1 is supported for now
CHECK_EQ(*mid_flag, 0) << "Broadcast only supports a single 1 in y_dims.";
CHECK_EQ(y_dims[i], 1) << "Broadcast dimension mismatch.";
// e.g. x of shape m*n*k with y of shape m*1*k
for (int j = 0; j < i; ++j) {
(*pre) *= y_dims[j];
}
*n = std::max(x_dims[i + axis], y_dims[i]);
*mid_flag = 1;
mid = i;
break;
}
(*n) *= y_dims[i];
}
if (*mid_flag) {
for (int i = mid + 1; i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
} else {
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
}
} else {
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch.";
(*n) *= y_dims[i];
}
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
}
}
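The (pre, n, post) triple flattens the broadcast: pre multiplies the x dims before axis, n the dims x shares with the trimmed y, and post whatever follows; the mid_flag branch additionally handles a single interior y dim of size 1. A self-contained reference for the exact-match path only, with a worked example (std::vector in place of DDim; assumptions mine):

#include <cassert>
#include <cstdint>
#include <vector>

// Reference for the exact-match path of GetMidDims (mid_flag == NULL),
// with std::vector<int64_t> standing in for lite::DDim.
void GetMidDimsRef(const std::vector<int64_t>& x,
                   const std::vector<int64_t>& y,
                   size_t axis, int* pre, int* n, int* post) {
  *pre = *n = *post = 1;
  for (size_t i = 0; i < axis; ++i) *pre *= x[i];
  for (size_t i = 0; i < y.size(); ++i) {
    assert(x[i + axis] == y[i]);  // overlap dims must match exactly
    *n *= y[i];
  }
  for (size_t i = axis + y.size(); i < x.size(); ++i) *post *= x[i];
}

int main() {
  int pre, n, post;
  // x = [2, 3, 4, 5], y = [3, 4], axis = 1  =>  pre = 2, n = 12, post = 5.
  GetMidDimsRef({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);
  return 0;
}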
void ElementwiseAddCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (post == 1) {
int r =
xdnn::matrix_vector_add(ctx.GetRawContext(),
param.X->data<float>(),
param.Y->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
pre,
n);
CHECK_EQ(r, 0);
return;
}
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_add(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
if (pre != 1 || post != 1) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_add(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_add(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
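With the decomposition in hand, the rewritten add picks one of three paths: post == 1 goes through xdnn::matrix_vector_add; a remaining broadcast (pre != 1 || post != 1) first expands y into a pre * n * post scratch buffer and then runs a plain elementwise_add; identical shapes fall through to a single elementwise_add call. A CPU sketch of the expansion step, on my reading of broadcast_ew with ElementwiseOp::ASSIGN (not the xdnn implementation):

// Tile y (length n) across pre and post:
// out[(i * n + j) * post + k] = y[j] for all i < pre, k < post.
void BroadcastAssignRef(const float* y, float* out, int pre, int n, int post) {
  for (int i = 0; i < pre; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < post; ++k)
        out[(i * n + j) * post + k] = y[j];
}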
- void ElementwiseSubCompute::Run() {
void ElementwiseMulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (post == 1) {
int r =
xdnn::matrix_vector_mul(ctx.GetRawContext(),
param.X->data<float>(),
param.Y->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
pre,
n);
CHECK_EQ(r, 0);
return;
}
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_sub(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
if (pre != 1 || post != 1) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_mul(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_mul(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
- void ElementwiseDivCompute::Run() {
void ElementwiseSubCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_div(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (len != param.Y->numel()) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_sub(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_sub(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
- void ElementwiseMulCompute::Run() {
void ElementwiseDivCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_mul(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (len != param.Y->numel()) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_div(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_div(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
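Note the asymmetry across the four ops after this rewrite: add and mul keep a matrix_vector_* fast path for the post == 1 case, while sub and div take the tiled-buffer path whenever pre * n * post differs from y's element count. Presumably xdnn exposes no matrix_vector_sub/div counterpart; that is an inference from this diff, not something it states.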
} // namespace xpu
} // namespace kernels
} // namespace lite
@@ -145,33 +336,33 @@ REGISTER_LITE_KERNEL(elementwise_add,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
- REGISTER_LITE_KERNEL(elementwise_sub,
REGISTER_LITE_KERNEL(elementwise_mul,
kXPU,
kFloat,
kNCHW,
- paddle::lite::kernels::xpu::ElementwiseSubCompute,
paddle::lite::kernels::xpu::ElementwiseMulCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
- REGISTER_LITE_KERNEL(elementwise_div,
REGISTER_LITE_KERNEL(elementwise_sub,
kXPU,
kFloat,
kNCHW,
- paddle::lite::kernels::xpu::ElementwiseDivCompute,
paddle::lite::kernels::xpu::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
- REGISTER_LITE_KERNEL(elementwise_mul,
REGISTER_LITE_KERNEL(elementwise_div,
kXPU,
kFloat,
kNCHW,
- paddle::lite::kernels::xpu::ElementwiseMulCompute,
paddle::lite::kernels::xpu::ElementwiseDivCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
......
@@ -138,7 +138,7 @@ bool XPUConv2dOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
param_.dilations = std::make_shared<std::vector<int>>(dilations);
param_.groups = op_desc.GetAttr<int>("groups");
if (op_desc.HasAttr("act_type")) {
- param_.act_type = op_desc.GetAttr<int>("act_type");
param_.act_type = op_desc.GetAttr<std::string>("act_type");
}
if (op_desc.HasAttr("filter_type")) {
......
@@ -1836,7 +1836,7 @@ struct XPUConv2dParam : ParamBase {
lite::Tensor* OutputMax{nullptr};
int groups{1};
- int act_type{-1};
std::string act_type{""};
std::string filter_type{""};
std::vector<int> strides;
std::shared_ptr<std::vector<int>> paddings;
......