Commit 728d6d1a authored by Yanzhan Yang, committed by StarryRain

fuse conv add batchnorm relu when using faster depthwise conv (#1749)

Parent 83fa6f82
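
The fusion rests on convolution being linear in its weights: ConvAddBNRelu's Init() already folds the batch-norm parameters into per-channel new_scale and new_bias arrays (computed just above the first hunk below), and this commit additionally multiplies every filter weight of output channel i by new_scale_ptr[i] when the faster 3x3 depthwise kernel applies. The raw depthwise output then comes out already scaled, so a single fused kernel, FasterDepthwiseConv3x3_bias_relu, only has to add new_bias and apply relu, and the separate ScaleAddChannelWise<RELU> pass is skipped. Below is a minimal standalone C++ sketch of the per-channel arithmetic, assuming the usual batch-norm folding formulas (gamma, beta, mean, var, conv_bias, acc are illustrative names, not paddle-mobile identifiers):

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative per-channel parameters of a conv + add + batch-norm + relu chain.
  const float conv_bias = 0.5f;            // bias from the elementwise-add op
  const float gamma = 1.2f, beta = -0.3f;  // batch-norm scale / shift
  const float mean = 0.1f, var = 0.04f, eps = 1e-5f;

  // What Init() is assumed to precompute once per channel.
  const float new_scale = gamma / std::sqrt(var + eps);
  const float new_bias = beta + (conv_bias - mean) * new_scale;

  // `acc` stands for one raw convolution accumulator, i.e. sum(w * x)
  // before any bias is added.
  const float acc = 0.7f;

  // Unfused reference: conv + add + batch-norm + relu, step by step.
  const float bn = (acc + conv_bias - mean) / std::sqrt(var + eps) * gamma + beta;
  const float reference = bn > 0.f ? bn : 0.f;

  // Fused path: scaling the filter weights by new_scale scales `acc` by
  // new_scale, so the whole chain collapses to relu(acc * new_scale + new_bias).
  const float fused_raw = acc * new_scale + new_bias;
  const float fused = fused_raw > 0.f ? fused_raw : 0.f;

  std::printf("reference = %f, fused = %f\n", reference, fused);
  return 0;
}

Both paths print the same value; that equality is what lets Compute() set fusion_has_been_computed and skip ScaleAddChannelWise<RELU> in the first file below.
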
@@ -61,16 +61,61 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
   param->SetNewBias(new_bias);
   InitBaseConvKernel(param);
+
+  // try to use faster depthwise conv
+  switch (param->ExecMode()) {
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
+      const std::vector<int> &paddings = param->Paddings();
+      const std::vector<int> &strides = param->Strides();
+      if (paddings.size() == 2 && paddings[0] == paddings[1] &&
+          strides.size() == 2 && strides[0] == strides[1]) {
+        int pad = paddings[0];
+        int stride = strides[0];
+        const int hin = param->Input()->dims()[2];
+        if (pad == 0 && hin > 2) {
+          could_use_faster_depthwise_conv_ = true;
+        } else if (pad == 1) {
+          could_use_faster_depthwise_conv_ = true;
+        }
+      }
+      break;
+  }
+  if (could_use_faster_depthwise_conv_) {
+    auto filter_data = param->Filter()->data<float>();
+    auto filter_dim = param->Filter()->dims();
+    int len = 1;
+    for (int i = 0; i < filter_dim.size(); i++) {
+      len *= filter_dim[i];
+    }
+    int batch = filter_dim[0];
+    int step = len / batch;
+    for (int i = 0; i < batch; i++) {
+      for (int k = 0; k < step; k++) {
+        filter_data[i * step + k] =
+            filter_data[i * step + k] * new_scale_ptr[i];
+      }
+    }
+  }
   return true;
 }

 template <>
 void ConvAddBNReluKernel<CPU, float>::Compute(
     const FusionConvAddBNReluParam<CPU> &param) {
+  bool fusion_has_been_computed = false;
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      DepthwiseConv3x3<float, float>(param);
+      if (could_use_faster_depthwise_conv_) {
+        FasterDepthwiseConv3x3_bias_relu(param, param.NewBias()->data<float>(),
+                                         true);
+        fusion_has_been_computed = true;
+      } else {
+        DepthwiseConv3x3<float, float>(param);
+      }
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
@@ -89,8 +134,10 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
-  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                  param.NewBias(), param.Output());
+  if (!fusion_has_been_computed) {
+    math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                    param.NewBias(), param.Output());
+  }
 }

 template class ConvAddBNReluKernel<CPU, float>;
@@ -212,8 +212,8 @@ void DepthwiseConv3x3(const ConvParam<CPU> &param) {
   }
 }

-template <>
-void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param) {
+void FasterDepthwiseConv3x3_bias_relu(const ConvParam<CPU> &param,
+                                      const float *bias, bool flag_relu) {
   const Tensor *input = param.Input();
   const Tensor *filter = param.Filter();
   const std::vector<int> &paddings = param.Paddings();
@@ -222,14 +222,11 @@ void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param) {
   Tensor *output = param.Output();
   output->mutable_data<float>();

-  if (paddings.size() == 2 && paddings[0] == paddings[1] &&
-      strides.size() == 2 && strides[0] == strides[1]) {
   int pad = paddings[0];
   int stride = strides[0];

   const float *din = input->data<float>();
   float *dout = output->mutable_data<float>();
   const float *weights = filter->data<float>();
-  const float *bias = nullptr;
   const int num = input->dims()[0];
   const int chin = input->dims()[1];
   const int hin = input->dims()[2];
@@ -237,37 +234,15 @@ void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param) {
   const int chout = output->dims()[1];
   const int hout = output->dims()[2];
   const int wout = output->dims()[3];
-  bool flag_relu = false;
   bool flag_bias = bias != nullptr;
   if (pad == 0 && hin > 2) {
     math::depthwise::conv_depthwise_3x3p0(din, dout, num, chout, hout, wout,
-                                          chin, hin, win, weights, bias,
-                                          stride, flag_bias, flag_relu);
+                                          chin, hin, win, weights, bias, stride,
+                                          flag_bias, flag_relu);
   } else if (pad == 1) {
     math::depthwise::conv_depthwise_3x3p1(din, dout, num, chout, hout, wout,
-                                          chin, hin, win, weights, bias,
-                                          stride, flag_bias, flag_relu);
-    } else {
-      GemmConv<float, float>(param);
-    }
-  } else {
-    if (strides[0] == 1) {
-      for (int i = 0; i < batch_size; i++) {
-        Tensor in_batch = input->Slice(i, i + 1);
-        Tensor out_batch = output->Slice(i, i + 1);
-        math::DepthwiseConv3x3S1<float, float>(in_batch, *filter, paddings,
-                                               &out_batch);
-      }
-    } else if (strides[0] == 2) {
-      for (int i = 0; i < batch_size; i++) {
-        Tensor in_batch = input->Slice(i, i + 1);
-        Tensor out_batch = output->Slice(i, i + 1);
-        math::DepthwiseConv3x3S2<float, float>(in_batch, *filter, paddings,
-                                               &out_batch);
-      }
-    } else {
-      GemmConv<float, float>(param);
-    }
+                                          chin, hin, win, weights, bias, stride,
+                                          flag_bias, flag_relu);
   }
 }
@@ -44,6 +44,9 @@ void DepthwiseConv5x5(const ConvParam<CPU> &param);
 template <typename Itype, typename Otype>
 void SlidingwindowConv3x3(const ConvParam<CPU> &param);

+void FasterDepthwiseConv3x3_bias_relu(const ConvParam<CPU> &param,
+                                      const float *bias, bool flag_relu);
+
 }  // namespace operators
 }  // namespace paddle_mobile
@@ -36,6 +36,9 @@ class ConvAddBNReluKernel
  public:
   void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
   bool Init(FusionConvAddBNReluParam<DeviceType> *param);
+
+ private:
+  bool could_use_faster_depthwise_conv_ = false;
 };

 }  // namespace operators