Commit 497bf326 authored by Yanzhan Yang, committed by StarryRain

fuse conv add batch relu when using faster depthwise conv (#1749)

Parent b42f3d49
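The change below teaches the fused conv+add+BN+ReLU kernel to hand the whole epilogue to the fast NEON depthwise-conv path. The enabling observation (sketched here in notation of my own, with s = NewScale and b = NewBias as folded during Init()): batch norm at inference is a per-channel affine map, so its scale can be folded into the filter weights once, ahead of time:

$$\mathrm{ReLU}\big(s_c \,(W_c * x) + b_c\big) \;=\; \mathrm{ReLU}\big((s_c W_c) * x + b_c\big)$$

What remains for the conv kernel is a per-channel bias add plus a ReLU, exactly the flag_bias/flag_relu epilogue the depthwise routines already support.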
@@ -61,16 +61,61 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
   param->SetNewBias(new_bias);
   InitBaseConvKernel(param);
+  // try to use faster depthwise conv
+  switch (param->ExecMode()) {
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
+      const std::vector<int> &paddings = param->Paddings();
+      const std::vector<int> &strides = param->Strides();
+      if (paddings.size() == 2 && paddings[0] == paddings[1] &&
+          strides.size() == 2 && strides[0] == strides[1]) {
+        int pad = paddings[0];
+        int stride = strides[0];
+        const int hin = param->Input()->dims()[2];
+        if (pad == 0 && hin > 2) {
+          could_use_faster_depthwise_conv_ = true;
+        } else if (pad == 1) {
+          could_use_faster_depthwise_conv_ = true;
+        }
+      }
+      break;
+  }
+  if (could_use_faster_depthwise_conv_) {
+    auto filter_data = param->Filter()->data<float>();
+    auto filter_dim = param->Filter()->dims();
+    int len = 1;
+    for (int i = 0; i < filter_dim.size(); i++) {
+      len *= filter_dim[i];
+    }
+    int batch = filter_dim[0];
+    int step = len / batch;
+    for (int i = 0; i < batch; i++) {
+      for (int k = 0; k < step; k++) {
+        filter_data[i * step + k] =
+            filter_data[i * step + k] * new_scale_ptr[i];
+      }
+    }
+  }
   return true;
 }

 template <>
 void ConvAddBNReluKernel<CPU, float>::Compute(
     const FusionConvAddBNReluParam<CPU> &param) {
+  bool fusion_has_been_computed = false;
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      DepthwiseConv3x3<float, float>(param);
+      if (could_use_faster_depthwise_conv_) {
+        FasterDepthwiseConv3x3_bias_relu(param, param.NewBias()->data<float>(),
+                                         true);
+        fusion_has_been_computed = true;
+      } else {
+        DepthwiseConv3x3<float, float>(param);
+      }
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
@@ -89,8 +134,10 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
-  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                  param.NewBias(), param.Output());
+  if (!fusion_has_been_computed) {
+    math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                    param.NewBias(), param.Output());
+  }
 }

 template class ConvAddBNReluKernel<CPU, float>;
...
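A note on the Init() hunk above: the weight pre-scaling is nothing more than a per-output-channel multiply over the flattened filter, done once so that Compute() can skip the separate ScaleAddChannelWise pass. A minimal self-contained sketch of that folding step (hypothetical FoldScaleIntoFilter helper over plain std::vector storage, not the framework's Tensor API):

#include <vector>

// Fold per-output-channel batch-norm scales into the filter weights.
// `filter` is flattened as [channels x step], where step is the number
// of weights per channel (3 * 3 = 9 for a depthwise 3x3 filter).
void FoldScaleIntoFilter(std::vector<float> &filter,
                         const std::vector<float> &scale, int step) {
  const int channels = static_cast<int>(scale.size());
  for (int c = 0; c < channels; ++c) {
    for (int k = 0; k < step; ++k) {
      filter[c * step + k] *= scale[c];
    }
  }
}

After the fold, the convolution output is already batch-norm scaled; only the NewBias add and the ReLU remain, which FasterDepthwiseConv3x3_bias_relu applies in-kernel.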
@@ -212,8 +212,8 @@ void DepthwiseConv3x3(const ConvParam<CPU> &param) {
   }
 }

-template <>
-void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param) {
+void FasterDepthwiseConv3x3_bias_relu(const ConvParam<CPU> &param,
+                                      const float *bias, bool flag_relu) {
   const Tensor *input = param.Input();
   const Tensor *filter = param.Filter();
   const std::vector<int> &paddings = param.Paddings();
@@ -222,52 +222,27 @@ void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param) {
   Tensor *output = param.Output();
   output->mutable_data<float>();
-  if (paddings.size() == 2 && paddings[0] == paddings[1] &&
-      strides.size() == 2 && strides[0] == strides[1]) {
-    int pad = paddings[0];
-    int stride = strides[0];
-    const float *din = input->data<float>();
-    float *dout = output->mutable_data<float>();
-    const float *weights = filter->data<float>();
-    const float *bias = nullptr;
-    const int num = input->dims()[0];
-    const int chin = input->dims()[1];
-    const int hin = input->dims()[2];
-    const int win = input->dims()[3];
-    const int chout = output->dims()[1];
-    const int hout = output->dims()[2];
-    const int wout = output->dims()[3];
-    bool flag_relu = false;
-    bool flag_bias = bias != nullptr;
-    if (pad == 0 && hin > 2) {
-      math::depthwise::conv_depthwise_3x3p0(din, dout, num, chout, hout, wout,
-                                            chin, hin, win, weights, bias,
-                                            stride, flag_bias, flag_relu);
-    } else if (pad == 1) {
-      math::depthwise::conv_depthwise_3x3p1(din, dout, num, chout, hout, wout,
-                                            chin, hin, win, weights, bias,
-                                            stride, flag_bias, flag_relu);
-    } else {
-      GemmConv<float, float>(param);
-    }
-  } else {
-    if (strides[0] == 1) {
-      for (int i = 0; i < batch_size; i++) {
-        Tensor in_batch = input->Slice(i, i + 1);
-        Tensor out_batch = output->Slice(i, i + 1);
-        math::DepthwiseConv3x3S1<float, float>(in_batch, *filter, paddings,
-                                               &out_batch);
-      }
-    } else if (strides[0] == 2) {
-      for (int i = 0; i < batch_size; i++) {
-        Tensor in_batch = input->Slice(i, i + 1);
-        Tensor out_batch = output->Slice(i, i + 1);
-        math::DepthwiseConv3x3S2<float, float>(in_batch, *filter, paddings,
-                                               &out_batch);
-      }
-    } else {
-      GemmConv<float, float>(param);
-    }
-  }
+  int pad = paddings[0];
+  int stride = strides[0];
+  const float *din = input->data<float>();
+  float *dout = output->mutable_data<float>();
+  const float *weights = filter->data<float>();
+  const int num = input->dims()[0];
+  const int chin = input->dims()[1];
+  const int hin = input->dims()[2];
+  const int win = input->dims()[3];
+  const int chout = output->dims()[1];
+  const int hout = output->dims()[2];
+  const int wout = output->dims()[3];
+  bool flag_bias = bias != nullptr;
+  if (pad == 0 && hin > 2) {
+    math::depthwise::conv_depthwise_3x3p0(din, dout, num, chout, hout, wout,
+                                          chin, hin, win, weights, bias, stride,
+                                          flag_bias, flag_relu);
+  } else if (pad == 1) {
+    math::depthwise::conv_depthwise_3x3p1(din, dout, num, chout, hout, wout,
+                                          chin, hin, win, weights, bias, stride,
+                                          flag_bias, flag_relu);
   }
 }
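The NEON kernels conv_depthwise_3x3p0/p1 themselves are outside this diff. As a mental model only, here is a naive scalar reference for the pad == 1, stride == 1 case with the same fused bias + ReLU epilogue (assumed NCHW layout and a single image; the real kernels also cover stride 2, padding 0, and batching):

#include <algorithm>

// Reference 3x3 depthwise conv, pad = 1, stride = 1: each channel is
// convolved with its own 3x3 kernel, and bias + ReLU are applied at the
// point of the output write instead of in a separate elementwise pass.
void DepthwiseConv3x3P1Ref(const float *din, float *dout, int ch, int h,
                           int w, const float *weights, const float *bias,
                           bool flag_bias, bool flag_relu) {
  for (int c = 0; c < ch; ++c) {
    const float *in = din + c * h * w;
    const float *wgt = weights + c * 9;
    float *out = dout + c * h * w;
    for (int oh = 0; oh < h; ++oh) {
      for (int ow = 0; ow < w; ++ow) {
        float sum = flag_bias ? bias[c] : 0.f;
        for (int kh = 0; kh < 3; ++kh) {
          for (int kw = 0; kw < 3; ++kw) {
            const int ih = oh + kh - 1;  // pad == 1 shifts the window
            const int iw = ow + kw - 1;
            if (ih >= 0 && ih < h && iw >= 0 && iw < w) {
              sum += in[ih * w + iw] * wgt[kh * 3 + kw];
            }
          }
        }
        out[oh * w + ow] = flag_relu ? std::max(sum, 0.f) : sum;
      }
    }
  }
}

Fusing the epilogue this way saves one full read-modify-write sweep over the output tensor, which is what the fusion_has_been_computed guard in Compute() is protecting.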
...

@@ -44,6 +44,9 @@ void DepthwiseConv5x5(const ConvParam<CPU> &param);
 template <typename Itype, typename Otype>
 void SlidingwindowConv3x3(const ConvParam<CPU> &param);
+
+void FasterDepthwiseConv3x3_bias_relu(const ConvParam<CPU> &param,
+                                      const float *bias, bool flag_relu);
 } // namespace operators
 } // namespace paddle_mobile
...
@@ -36,6 +36,9 @@ class ConvAddBNReluKernel
  public:
   void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
   bool Init(FusionConvAddBNReluParam<DeviceType> *param);
+
+ private:
+  bool could_use_faster_depthwise_conv_ = false;
 };

 } // namespace operators
...
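Taken together, the two flags split the bookkeeping across the kernel's lifetime: could_use_faster_depthwise_conv_ is decided once in Init(), where the filter weights are pre-scaled, while fusion_has_been_computed is per-call state in Compute() that keeps the fallback ScaleAddChannelWise pass from re-applying the batch-norm scale and bias to output the fused kernel has already finished. Pre-scaling in Init() is safe here because, once the flag is set, the 3x3 depthwise execution modes always take the fused path with those scaled weights.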