Unverified commit 194e5a76, authored by sunsetlh, committed by GitHub

[XPU] fix bugs: __xpu__conv2d, activation, elementwise (#4278)

Parent d91fdbb5
@@ -244,6 +244,7 @@ class XPUConv2dBlock0Fuser : public FuseBase {
std::string output_name = "";
if (_with_relu) {
op_desc.SetAttr("act_type", std::string{"relu"});
output_name = matched.at("relu_out")->arg()->name;
} else {
output_name = matched.at("bn_out")->arg()->name;
@@ -433,6 +434,7 @@ class XPUConv2dBlock1Fuser : public FuseBase {
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
scope->NewTensor(max_output_name);
op_desc.SetOutput("OutputMax", {max_output_name});
op_desc.SetAttr("act_type", std::string{"relu"});
auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
auto& valid_places = conv_old->valid_places();
......
@@ -48,8 +48,9 @@ void XPUConv2dCompute::Run() {
std::string filter_type = param.filter_type;
int groups = param.groups;
- int act_type = (param.act_type == -1) ? xdnn::Activation_t::RELU
-                                        : param.act_type;  // -1 means not init
int act_type = (param.act_type == "relu")
                   ? xdnn::Activation_t::RELU
                   : xdnn::Activation_t::LINEAR;  // anything but "relu" means no fused activation
const auto* bias = param.Bias ? param.Bias->data<float>() : nullptr;
const auto* branch = param.Branch ? param.Branch->data<float>() : nullptr;
const float* input_max =
@@ -60,7 +61,6 @@ void XPUConv2dCompute::Run() {
float* output = param.Output->mutable_data<float>(TARGET(kXPU));
// TODO(luohang): only ResNet-50 is supported for now
- CHECK_EQ(act_type, xdnn::Activation_t::RELU);
CHECK_EQ(groups, 1);
CHECK_EQ(filter_type, "int16");
......
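Taken together with the fuser, op, and param changes further down, act_type now travels through the graph as a string attribute instead of a raw int, and the kernel maps it onto an xdnn activation at Run() time. A condensed sketch of that mapping (illustrative only; the enum is a stand-in for xdnn's real constants, and only "relu" is recognized here, mirroring the check above):

#include <string>

// Stand-ins for the two xdnn activation values used above (the real
// constants come from xdnn's headers).
enum Activation { LINEAR = 0, RELU = 1 };

// "relu" selects RELU; anything else, including the default-initialized
// empty string in XPUConv2dParam, falls back to LINEAR (no fused activation).
Activation ToAct(const std::string& act_type) {
  return act_type == "relu" ? RELU : LINEAR;
}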
@@ -73,6 +73,19 @@ void AbsCompute::Run() {
CHECK_EQ(r, 0);
}
void ExpCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::EXP, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void SquareCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
@@ -86,6 +99,19 @@ void SquareCompute::Run() {
CHECK_EQ(r, 0);
}
void ReciprocalCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::RECIPROCAL, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void SqrtCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
@@ -103,11 +129,14 @@ void PowCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
xdnn::Activation_t act_type(xdnn::Activation_t::ACT_POW);
act_type.pow_factor = param.factor;
int r = xdnn::activation_forward(
- ctx.GetRawContext(), /* context */
- xdnn::Activation_t::ACT_POW, /* type */
- param.X->numel(), /* len */
- param.X->data<float>(), /* x */
ctx.GetRawContext(), /* context */
act_type, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
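The PowCompute fix: the old call passed the bare enum xdnn::Activation_t::ACT_POW, so param.factor was never forwarded; constructing an Activation_t and setting pow_factor carries the exponent along with the call. For reference, what ACT_POW is expected to compute (a CPU sketch under the assumption y[i] = x[i]^factor):

#include <cmath>
#include <cstddef>

// CPU reference for the pow activation with exponent `factor`:
// y[i] = pow(x[i], factor).
void PowRef(const float* x, float* y, std::size_t len, float factor) {
  for (std::size_t i = 0; i < len; ++i) y[i] = std::pow(x[i], factor);
}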
@@ -158,6 +187,12 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(
exp, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ExpCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(
square, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SquareCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
@@ -181,3 +216,13 @@ REGISTER_LITE_KERNEL(
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(reciprocal,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ReciprocalCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
@@ -57,6 +56,15 @@ class AbsCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~AbsCompute() = default;
};
class ExpCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~ExpCompute() = default;
};
class SquareCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
@@ -66,6 +74,15 @@ class SquareCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~SquareCompute() = default;
};
class ReciprocalCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~ReciprocalCompute() = default;
};
class SqrtCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
@@ -77,7 +94,7 @@ class SqrtCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
class PowCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
- using param_t = operators::ActivationParam;
using param_t = operators::PowParam;
virtual void Run();
......
@@ -13,8 +13,12 @@
// limitations under the License.
#include "lite/kernels/xpu/elementwise_compute.h"
#include <algorithm>
#include <functional>
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
namespace paddle {
@@ -22,113 +26,300 @@ namespace lite {
namespace kernels {
namespace xpu {
inline DDim TrimTrailingSingularDims(const DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
std::vector<int64_t> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim();
}
DDim actual_dims = DDim(trim_dims);
return actual_dims;
}
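For intuition: only trailing 1s are stripped, leading and interior 1s survive, and an all-ones shape collapses to an empty DDim, which the elementwise kernels below turn into axis = x_dims.size(). A standalone sketch of the same rule, with std::vector<int64_t> standing in for DDim:

#include <cassert>
#include <cstdint>
#include <vector>

// Drop trailing size-1 dims only; [4, 1, 3, 1, 1] -> [4, 1, 3].
std::vector<int64_t> TrimTrailingOnes(std::vector<int64_t> d) {
  while (!d.empty() && d.back() == 1) d.pop_back();
  return d;
}

int main() {
  assert((TrimTrailingOnes({4, 1, 3, 1, 1}) == std::vector<int64_t>{4, 1, 3}));
  assert(TrimTrailingOnes({1, 1}).empty());  // all ones -> empty shape
  return 0;
}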
inline void GetMidDims(const DDim& x_dims,
const DDim& y_dims,
const int axis,
int* pre,
int* n,
int* post,
int* mid_flag = NULL) {
*pre = 1;
*n = 1;
*post = 1;
if (mid_flag != NULL) {
*mid_flag = 0;
int mid = 0;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
if (x_dims[i + axis] != y_dims[i]) {
// only a single y_dims[i] == 1 is supported for now
CHECK_EQ(*mid_flag, 0) << "Broadcast only supports a single 1 in y_dims.";
CHECK_EQ(y_dims[i], 1) << "Broadcast dimension mismatch.";
// e.g. x of shape m*n*k with y of shape m*1*k
for (int j = 0; j < i; ++j) {
(*pre) *= y_dims[j];
}
*n = std::max(x_dims[i + axis], y_dims[i]);
*mid_flag = 1;
mid = i;
break;
}
(*n) *= y_dims[i];
}
if (*mid_flag) {
for (int i = mid + 1; i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
} else {
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
}
} else {
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch.";
(*n) *= y_dims[i];
}
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
}
}
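The (pre, n, post) triple flattens the broadcast: pre multiplies the x dims before axis, n the dims x shares with the trimmed y, and post whatever follows; the mid_flag branch additionally handles a single interior y dim of size 1. A self-contained reference for the exact-match path only, with a worked example (std::vector in place of DDim; assumptions mine):

#include <cassert>
#include <cstdint>
#include <vector>

// Reference for the exact-match path of GetMidDims (mid_flag == NULL),
// with std::vector<int64_t> standing in for lite::DDim.
void GetMidDimsRef(const std::vector<int64_t>& x,
                   const std::vector<int64_t>& y,
                   size_t axis, int* pre, int* n, int* post) {
  *pre = *n = *post = 1;
  for (size_t i = 0; i < axis; ++i) *pre *= x[i];
  for (size_t i = 0; i < y.size(); ++i) {
    assert(x[i + axis] == y[i]);  // overlap dims must match exactly
    *n *= y[i];
  }
  for (size_t i = axis + y.size(); i < x.size(); ++i) *post *= x[i];
}

int main() {
  int pre, n, post;
  // x = [2, 3, 4, 5], y = [3, 4], axis = 1  =>  pre = 2, n = 12, post = 5.
  GetMidDimsRef({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);
  return 0;
}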
void ElementwiseAddCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (post == 1) {
int r =
xdnn::matrix_vector_add(ctx.GetRawContext(),
param.X->data<float>(),
param.Y->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
pre,
n);
CHECK_EQ(r, 0);
return;
}
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_add(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
if (pre != 1 || post != 1) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_add(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_add(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
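With the decomposition in hand, the rewritten add picks one of three paths: post == 1 goes through xdnn::matrix_vector_add; a remaining broadcast (pre != 1 || post != 1) first expands y into a pre * n * post scratch buffer and then runs a plain elementwise_add; identical shapes fall through to a single elementwise_add call. A CPU sketch of the expansion step, on my reading of broadcast_ew with ElementwiseOp::ASSIGN (not the xdnn implementation):

// Tile y (length n) across pre and post:
// out[(i * n + j) * post + k] = y[j] for all i < pre, k < post.
void BroadcastAssignRef(const float* y, float* out, int pre, int n, int post) {
  for (int i = 0; i < pre; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < post; ++k)
        out[(i * n + j) * post + k] = y[j];
}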
- void ElementwiseSubCompute::Run() {
void ElementwiseMulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (post == 1) {
int r =
xdnn::matrix_vector_mul(ctx.GetRawContext(),
param.X->data<float>(),
param.Y->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
pre,
n);
CHECK_EQ(r, 0);
return;
}
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_sub(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
if (pre != 1 || post != 1) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_mul(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_mul(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
- void ElementwiseDivCompute::Run() {
void ElementwiseSubCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_div(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (len != param.Y->numel()) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_sub(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_sub(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
- void ElementwiseMulCompute::Run() {
void ElementwiseDivCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
- auto& x_dims = param.X->dims().data();
auto& x_dims = param.X->dims();
auto& y_dims = param.Y->dims();
int axis = param.axis;
- if (param.axis == -1) {
-   axis = x_dims.size() - y_dims.size();
- }
- int iter = std::accumulate(
-     x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
- int stride = param.Y->numel();
- for (int i = 0; i < iter; ++i) {
-   const float* x_ptr = param.X->data<float>() + i * stride;
-   const float* y_ptr = param.Y->data<float>();
-   float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
-   int r = xdnn::elementwise_mul(ctx.GetRawContext(), /* context */
-                                 x_ptr, /* x */
-                                 y_ptr, /* y */
-                                 o_ptr, /* z */
-                                 stride /* len */);
-   CHECK_EQ(r, 0);
- }
auto y_dims_untrimed = y_dims;
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
auto y_dims_after_trailing = TrimTrailingSingularDims(y_dims_untrimed);
axis = (y_dims_after_trailing.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
GetMidDims(x_dims, y_dims_after_trailing, axis, &pre, &n, &post);
int len = pre * n * post;
float* y_broadcast = nullptr;
if (len != param.Y->numel()) {
XPUScratchPadGuard y_broadcast_xpu_guard_ =
TargetWrapperXPU::MallocScratchPad(len * sizeof(float),
false /* use_l3 */);
y_broadcast = reinterpret_cast<float*>(y_broadcast_xpu_guard_->addr_);
int r = xdnn::broadcast_ew(ctx.GetRawContext(),
param.Y->data<float>(),
y_broadcast,
pre,
n,
post,
xdnn::ElementwiseOp::ASSIGN);
CHECK_EQ(r, 0);
r = xdnn::elementwise_div(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
y_broadcast, /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
return;
}
int r = xdnn::elementwise_div(
ctx.GetRawContext(), /* context */
param.X->data<float>(), /* x */
param.Y->data<float>(), /* y */
param.Out->mutable_data<float>(TARGET(kXPU)), /* z */
len);
CHECK_EQ(r, 0);
}
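Note the asymmetry across the four ops after this rewrite: add and mul keep a matrix_vector_* fast path for the post == 1 case, while sub and div take the tiled-buffer path whenever pre * n * post differs from y's element count. Presumably xdnn exposes no matrix_vector_sub/div counterpart; that is an inference from this diff, not something it states.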
} // namespace xpu
} // namespace kernels
} // namespace lite
@@ -145,33 +336,33 @@ REGISTER_LITE_KERNEL(elementwise_add,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
- REGISTER_LITE_KERNEL(elementwise_sub,
REGISTER_LITE_KERNEL(elementwise_mul,
kXPU,
kFloat,
kNCHW,
- paddle::lite::kernels::xpu::ElementwiseSubCompute,
paddle::lite::kernels::xpu::ElementwiseMulCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
- REGISTER_LITE_KERNEL(elementwise_div,
REGISTER_LITE_KERNEL(elementwise_sub,
kXPU,
kFloat,
kNCHW,
- paddle::lite::kernels::xpu::ElementwiseDivCompute,
paddle::lite::kernels::xpu::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
- REGISTER_LITE_KERNEL(elementwise_mul,
REGISTER_LITE_KERNEL(elementwise_div,
kXPU,
kFloat,
kNCHW,
- paddle::lite::kernels::xpu::ElementwiseMulCompute,
paddle::lite::kernels::xpu::ElementwiseDivCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
......
@@ -138,7 +138,7 @@ bool XPUConv2dOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
param_.dilations = std::make_shared<std::vector<int>>(dilations);
param_.groups = op_desc.GetAttr<int>("groups");
if (op_desc.HasAttr("act_type")) {
- param_.act_type = op_desc.GetAttr<int>("act_type");
param_.act_type = op_desc.GetAttr<std::string>("act_type");
}
if (op_desc.HasAttr("filter_type")) {
......
@@ -1836,7 +1836,7 @@ struct XPUConv2dParam : ParamBase {
lite::Tensor* OutputMax{nullptr};
int groups{1};
- int act_type{-1};
std::string act_type{""};
std::string filter_type{""};
std::vector<int> strides;
std::shared_ptr<std::vector<int>> paddings;
......