diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 5385bcdaec3f9e86cf0d7bf33f270e48ae186fba..eae65968285703f5882d910e29bc5d8e1511cba6 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -300,6 +300,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     bool fuse_relu = ctx.Attr<bool>("fuse_relu");
+    bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
     int groups = ctx.Attr<int>("groups");
 
     // TODO: add support for dilation
@@ -366,12 +367,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
           bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd =
-          ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides,
-                               paddings, mkldnn_engine, fuse_relu);
+      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
+                                     strides, paddings, mkldnn_engine,
+                                     fuse_relu, fuse_eltwise);
     } else {
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
-                                     paddings, mkldnn_engine, fuse_relu);
+      conv_pd =
+          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
+                               mkldnn_engine, fuse_relu, fuse_eltwise);
     }
     // Save conv_pd/src_memory/weights_memory for backward pass
     dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -421,16 +423,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }
 
  private:
-  mkldnn::primitive_attr AddRelu() const {
-    // Fusion with ReLU layer is executed through the PostOps feature. Create a
-    // PostOps object and configure it to execute an eltwise relu operation.
+  mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
+                                       bool fuse_eltwise) const {
     mkldnn::primitive_attr conv_attr;
-    constexpr float scale = 1.0f;
-    constexpr float negative_slope = 0.0f;
-    constexpr float placeholder = 0.0f;
     mkldnn::post_ops post_operations;
-    post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
-                                   negative_slope, placeholder);
+    // Fusion with Elementwise layer relies on adding a sum post-operation with
+    // the scale parameter. It is assumed that when fuse_eltwise is true, the
+    // Output tensor contains the data coming from residual connection. The
+    // result of this post_op is: Output = scale * Output + Conv_Out.
+    if (fuse_eltwise) {
+      post_operations.append_sum(1.0f);
+    }
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    if (fuse_relu) {
+      constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 0.0f;
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     negative_slope, placeholder);
+    }
     conv_attr.set_post_ops(post_operations);
     return conv_attr;
   }
@@ -439,8 +451,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                        const memory::desc& dst, const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine,
-                       const bool fuse_relu) const {
+                       const mkldnn::engine& engine, const bool fuse_relu,
+                       const bool fuse_eltwise) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -449,10 +461,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    mkldnn::primitive_attr conv_attr;
-    if (fuse_relu) {
-      conv_attr = AddRelu();
-    }
+    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
 
     auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
         conv_desc, conv_attr, engine);
@@ -466,8 +475,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const memory::desc& bias, const memory::desc& dst,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine,
-                       const bool fuse_relu) const {
+                       const mkldnn::engine& engine, const bool fuse_relu,
+                       const bool fuse_eltwise) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -476,10 +485,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         bias, dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    mkldnn::primitive_attr conv_attr;
-    if (fuse_relu) {
-      conv_attr = AddRelu();
-    }
+    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
 
     auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
         conv_desc, conv_attr, engine);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 41d4fcf6de7c8fcb3cfbb2063b0a2ac1a2356168..8f84bf71a7f77606bed6672f0830e3fc80165a42 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -164,6 +164,11 @@ void Conv2DOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("fuse_eltwise",
+                "(bool, default false) Only used in mkldnn kernel. Used "
+                "whenever convolution output is connected via skip connection "
+                "to a previous layer.")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index adad2428f7fdc554cf4efd652f52b5c5de0ab527..49ba2cfd55bc881ed753fcefbd41f5b8fd4ebaf7 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -65,8 +65,43 @@ class InferenceTranspiler(object):
         if use_mkldnn:
             self._fuse_conv_bias_mkldnn(program)
             self._fuse_conv_relu_mkldnn(program)
+            self._fuse_conv_eltwise_mkldnn(program)
+            self._fuse_conv_relu_mkldnn(
+                program)  # ResNet residual block merging
             self._fuse_bn_relu_mkldnn(program)
 
+    def _fuse_conv_eltwise_mkldnn(self, program):
+        '''
+        Transpile the program by fusing elementwise_add into conv for MKLDNN.
+        An elementwise_add that follows a convolution OP can be fused by adding
+        the 'fuse_eltwise' attribute to the convolution OP and replacing its
+        output Tensor with the second input (Y) of the elementwise_add.
+        The result of the fusion is:
+            - before:
+                - conv->elementwise_add->any_other_op
+            - after:
+                - conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type in ['conv2d']:
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'elementwise_add':
+                    self._fuse_conv_eltwise(current_op, next_op)
+                    self.block._remove_op(i + 1)  # Remove elementwise_add
+            i = i + 1
+        self._adjust_input()
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
     def _fuse_conv_relu_mkldnn(self, program):
         '''
         Transpile the program by fused relu activation for MKLDNN program.
@@ -88,9 +123,9 @@ class InferenceTranspiler(object):
             if current_op.type in ['conv2d']:
                 next_op = self.block.ops[i + 1]
                 if next_op.type == 'relu':
                     # modify conv OP to include relu
                     current_op.set_attr("fuse_relu", True)
-                    # remove conv OP
+                    # remove relu OP
                     self.block._remove_op(i + 1)
             i = i + 1
 
@@ -409,6 +444,20 @@ class InferenceTranspiler(object):
                 outputs={"Output": out_var},
                 attrs=attrs)
 
+    def _fuse_conv_eltwise(self, conv_op, eltwise_op):
+        '''
+        Fuse the conv op with elementwise_add.
+
+        :param conv_op: convolution operator
+        :type conv_op: Operator
+        :param eltwise_op: operator adding data from the skip connection
+        :type eltwise_op: Operator
+        '''
+
+        conv_op.set_attr("fuse_eltwise", True)
+        self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0]
+        self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0]
+
     def _adjust_input(self):
         for i in range(len(self.block.ops)):
            current_op = self.block.ops[i]
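
For illustration, below is a minimal sketch of how the new fusion pass would be exercised from user code. The model directory name is hypothetical, and setting FLAGS_use_mkldnn in the environment is an assumption about how the `use_mkldnn` flag checked in transpile() gets enabled; only InferenceTranspiler.transpile() itself is part of the code touched by this patch.

import os
import paddle.fluid as fluid

# Assumed switch for the MKLDNN path of the inference transpiler.
os.environ['FLAGS_use_mkldnn'] = 'True'

place = fluid.CPUPlace()
exe = fluid.Executor(place)

# Load a saved inference model (e.g. a ResNet whose residual blocks end in
# conv2d -> elementwise_add -> relu) into a Program. 'resnet_model' is a
# hypothetical directory.
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    'resnet_model', exe)

# The transpiler rewrites the Program in place: with the passes added in this
# patch, each conv2d followed by elementwise_add collapses into a single
# conv2d carrying fuse_eltwise=True (and fuse_relu=True if a relu follows),
# so the MKLDNN kernel applies the sum/relu post-ops configured above.
t = fluid.transpiler.InferenceTranspiler()
t.transpile(program, place)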