From bd0b38e6718c2d9790a2764a9fbba8fa7ca444f9 Mon Sep 17 00:00:00 2001 From: Adam <38704900+grygielski@users.noreply.github.com> Date: Tue, 23 Jun 2020 15:26:54 +0200 Subject: [PATCH] Refactor of conv fp32 oneDNN operator (#25137) * Refactor of conv fp32 oneDNN operator test=develop * Formatting fix test=develop * Return Enforces test=develop * GetWeights improvements test=develop --- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 624 ++++++++++-------- paddle/fluid/platform/mkldnn_reuse.h | 67 ++ 2 files changed, 411 insertions(+), 280 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index a01bf8f9b9c..ac6ddebb813 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -26,42 +26,24 @@ using mkldnn::memory; using mkldnn::primitive; using mkldnn::reorder; using mkldnn::stream; -using platform::to_void_cast; using platform::GetMKLDNNFormat; +using platform::to_void_cast; inline void GetWeightsTz(std::vector& weights_tz, // NOLINT - int groups, bool is_conv3d) { + const int groups) { if (groups > 1) { - if (is_conv3d) { - int output = weights_tz[0]; - int input = weights_tz[1]; - int dimension = weights_tz[2]; - int height = weights_tz[3]; - int width = weights_tz[4]; - weights_tz.resize(6); - weights_tz[0] = groups; - weights_tz[1] = output / groups; - weights_tz[2] = input; - weights_tz[3] = dimension; - weights_tz[4] = height; - weights_tz[5] = width; - } else { - int output = weights_tz[0]; - int input = weights_tz[1]; - int height = weights_tz[2]; - int width = weights_tz[3]; - weights_tz.resize(5); - weights_tz[0] = groups; - weights_tz[1] = output / groups; - weights_tz[2] = input; - weights_tz[3] = height; - weights_tz[4] = width; - } + // if (is_conv3d) [o, i, d, h, w]->[g, o/g, i, d, h, w] + // else [o, i, h, w] -> [g, o/g, i, h, w] + weights_tz.push_back(0); + std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end()); + weights_tz[0] = groups; + weights_tz[1] = weights_tz[1] / groups; } } -inline MKLDNNMemoryFormat GetWeightsFormat(MKLDNNMemoryFormat format, - int groups, bool is_conv3d) { +inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, + const int groups, + const bool is_conv3d) { if (is_conv3d) { return (groups == 1) ? format : MKLDNNMemoryFormat::goidhw; } else { @@ -90,304 +72,386 @@ static mkldnn::memory::data_type GetDstType(bool is_int8, return dst_dt; } -template -class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { +template +class ConvMKLDNNHandlerT + : public platform::MKLDNNHandlerT { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( - "Operator DNNL Conv must use CPUPlace")); - bool is_INT8 = - std::is_same::value || std::is_same::value; - if (!is_INT8) { - ComputeFP32(ctx); - } else { - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); - auto residual_param = ctx.Input("ResidualData"); - auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation, - fuse_residual_conn, residual_param); - if (dst_dt == mkldnn::memory::data_type::f32) { - ComputeINT8(ctx); - } else if (dst_dt == mkldnn::memory::data_type::u8) { - ComputeINT8(ctx); - } else if (dst_dt == mkldnn::memory::data_type::s8) { - ComputeINT8(ctx); - } - } - } + ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx, + const platform::MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* input, + const Tensor* filter, const Tensor* bias, Tensor* output, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + platform::CreateKey(framework::vectorize(input->dims()), + unique_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ( + input->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "The input tensor's layout should be %d, but got %d.", + DataLayout::kMKLDNN, input->layout())); + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Input tensor")); - void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { - const bool is_test = ctx.Attr("is_test"); + PADDLE_ENFORCE_EQ( + filter->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "The Filter tensor's layout should be %d, but got %d.", + DataLayout::kMKLDNN, filter->layout())); + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Filter tensor")); - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); + PADDLE_ENFORCE_GE( + input->dims().size(), 4, + platform::errors::InvalidArgument( + "Input must be with 4 or 5 dimensions, i.e. NCHW or " + "NCDHW, but got dimension = %d .", + input->dims().size())); + PADDLE_ENFORCE_LE( + input->dims().size(), 5, + platform::errors::InvalidArgument( + "Input must be with 4 or 5 dimensions, i.e. NCHW or " + "NCDHW, but got dimension = %d .", + input->dims().size())); - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - auto* output = ctx.Output("Output"); + PADDLE_ENFORCE_GE( + filter->dims().size(), 4, + platform::errors::InvalidArgument( + "Filter must be with 4 or 5 dimensions, i.e. OIHW or " + "OIDHW, but got dimension = %d .", + filter->dims().size())); + PADDLE_ENFORCE_LE( + filter->dims().size(), 5, + platform::errors::InvalidArgument( + "Filter must be with 4 or 5 dimensions, i.e. OIHW or " + "OIDHW, but got dimension = %d .", + filter->dims().size())); - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE( - input->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Input tensor")); + if (bias) { + PADDLE_ENFORCE_EQ( + bias->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "The Bias tensor's layout should be %d, but got %d.", + DataLayout::kMKLDNN, bias->layout())); + PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Got wrong format for Bias tensor.")); - PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The Filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Filter tensor")); + PADDLE_ENFORCE_EQ(bias->dims().size(), 1, + platform::errors::InvalidArgument( + "Bias must only have 1 dimension, " + "i.e. X, but got dimension = %d .", + bias->dims().size())); + } - PADDLE_ENFORCE_GE(input->dims().size(), 4, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); - PADDLE_ENFORCE_LE(input->dims().size(), 5, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); + const std::string fuse_activation = + ctx.Attr("fuse_activation"); + const float fuse_alpha = ctx.Attr("fuse_alpha"); + const float fuse_beta = ctx.Attr("fuse_beta"); + const bool fuse_residual_conn = + ctx.Attr("fuse_residual_connection"); + const int groups = ctx.Attr("groups"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); - PADDLE_ENFORCE_GE(filter->dims().size(), 4, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW or " - "OIDHW, but got dimension = %d .", - filter->dims().size())); - PADDLE_ENFORCE_LE(filter->dims().size(), 5, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW or " - "OIDHW, but got dimension = %d .", - filter->dims().size())); + const auto input_dims = input->dims(); + const auto data_dims = + framework::slice_ddim(input_dims, 2, input_dims.size()); + const auto filter_dims = filter->dims(); + const auto filter_data_dims = + framework::slice_ddim(filter_dims, 2, filter_dims.size()); - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The Bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); + const auto ksize = framework::vectorize(filter_data_dims); + const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE_EQ( - bias->dims().size(), 1, - platform::errors::InvalidArgument("Bias must only have 1 dimension, " - "i.e. X, but got dimension = %d .", - bias->dims().size())); - } + auto strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); + auto paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); + auto dilations_temp = ctx.Attr>("dilations"); + std::vector dilations(begin(dilations_temp), + end(dilations_temp)); - std::vector dilations_temp = ctx.Attr>("dilations"); - std::vector dilations(begin(dilations_temp), end(dilations_temp)); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + data_dims, strides, ksize); + const bool is_conv3d = strides.size() == 3U; - std::string fuse_activation = ctx.Attr("fuse_activation"); - float fuse_alpha = ctx.Attr("fuse_alpha"); - float fuse_beta = ctx.Attr("fuse_beta"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - int groups = ctx.Attr("groups"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - bool is_conv3d = strides.size() == 3U; + PADDLE_ENFORCE_EQ( + is_conv3d + ? dilations.size() == 3 && dilations[0] == 1 && + dilations[1] == 1 && dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + true, platform::errors::Unimplemented( + "Dilation in oneDNN convolution is not implemented yet")); - auto input_dims = input->dims(); - auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); - auto filter_dims = filter->dims(); - auto filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); + const auto src_tz = paddle::framework::vectorize(input->dims()); - auto ksize = framework::vectorize(filter_data_dims); + auto weights_tz = paddle::framework::vectorize(filter->dims()); + GetWeightsTz(weights_tz, groups); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - data_dims, strides, ksize); + const auto dst_tz = paddle::framework::vectorize(output->dims()); - std::vector pipeline; + const mkldnn::memory::dims stride_dims = strides; + const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); - PADDLE_ENFORCE( - is_conv3d - ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && - dilations[2] == 1 - : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - "dilation in convolution is not implemented yet"); + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + // TODO(jczaja): This is workaround to make grad op UT's numerical + // gradient computation proper as this op is called directly without + // fetch op following it , so numercial grad is computed (in python) + // using block formats which will give wrong results + const std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + is_test ? MKLDNNMemoryFormat::any + : platform::data_format_to_memory_format(data_format); + + // Check the format for user's special output + if (chosen_memory_format != MKLDNNMemoryFormat::any) { + if (is_conv3d) { + chosen_memory_format = platform::MKLDNNFormatForSize( + src_tz.size(), chosen_memory_format); + } + } - const T* input_data = input->data(); - const T* filter_data = filter->data(); + const auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + const auto weights_md = + platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + const auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - auto src_tz = paddle::framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); - int g = std::max(groups, 1); + const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training; - GetWeightsTz(weights_tz, g, is_conv3d); + const mkldnn::primitive_attr conv_attr = CreatePostOps( + fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn); - auto dst_tz = paddle::framework::vectorize(output->dims()); + if (bias) { + auto bias_tz = framework::vectorize(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); + + this->AcquireForwardPrimitiveDescriptor( + conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, + src_md, weights_md, bias_md, dst_md, stride_dims, + mkldnn_paddings[0], mkldnn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, + src_md, weights_md, dst_md, stride_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + } + } + } - // Get unique name for storing MKLDNN primitives - const std::string key = platform::CreateKey( - src_tz, ctx.InputName("Input") + ctx.InputName("Filter")); + mkldnn::primitive_attr CreatePostOps( + std::string fuse_activation, float fuse_alpha, float fuse_beta, + bool fuse_residual_conn, const std::vector output_shift_scale = {}, + float sum_scale = 1.0f) { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + if (output_shift_scale.size() > 0) { + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + } - auto src_format = input->format(); - MKLDNNMemoryFormat weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); + // Fusion with Elementwise layer relies on adding a sum post-operation with + // the scale parameter. It is assumed that when fuse_residual_connection is + // true, the output tensor contains the data coming from residual + // connection. The result of this post_op is: + // Output = scale * Output + Conv_Out. + if (fuse_residual_conn) { + post_operations.append_sum(sum_scale); + } + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. + if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { + constexpr float scale = 1.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + fuse_alpha, fuse_beta); + } else if (fuse_activation == "relu6") { + constexpr float scale = 1.0f; + post_operations.append_eltwise(scale, + mkldnn::algorithm::eltwise_bounded_relu, + fuse_alpha, fuse_beta); + } else if (fuse_activation == "swish") { + constexpr float scale = 1.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_swish, + fuse_alpha, fuse_beta); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::shared_ptr AcquireSrcMemoryWithReorder( + const framework::Tensor* input) { + const T* input_data = input->data(); auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), src_format); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), weights_format); + framework::vectorize(input->dims()), platform::MKLDNNGetDataType(), + input->format()); - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - // TODO(jczaja): This is workaround to make grad op UT's numerical - // gradient computation proper as this op is called directly without - // fetch op following it , so numercial grad is computed (in python) - // using block formats which will give wrong results - std::string data_format = ctx.Attr("data_format"); - auto chosen_memory_format = - is_test ? MKLDNNMemoryFormat::any - : platform::data_format_to_memory_format(data_format); + return this->AcquireMemoryWithReorder( + user_src_md, this->fwd_pd_->src_desc(), to_void_cast(input_data), + "@src_mem_p"); + } - weights_format = MKLDNNMemoryFormat::any; - // Check the format for user's special output - if (chosen_memory_format != MKLDNNMemoryFormat::any) { - if (is_conv3d) { - chosen_memory_format = - platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); - } + std::shared_ptr AcquireWeightsMemoryWithReorder( + const framework::Tensor* filter, const int groups, const bool is_conv3d, + const bool is_test) { + // This is workaround to make execution faster, delete + // if statement after including md inside Tensor + auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); + if (is_test && weights_mem_p) { + return weights_mem_p; + } else { + const T* filter_data = filter->data(); + auto weights_tz = framework::vectorize(filter->dims()); + GetWeightsTz(weights_tz, groups); + + auto user_src_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), + GetWeightsFormat(filter->format(), groups, is_conv3d)); + + return this->AcquireMemoryWithReorder( + user_src_md, this->fwd_pd_->weights_desc(), + to_void_cast(filter_data), "@weights_mem_p", is_test); } + } - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), weights_format); - std::vector bias_tz; - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + std::shared_ptr AcquireBiasMemoryWithReorder( + const framework::Tensor* bias, const bool is_test) { + const T* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + framework::vectorize(bias->dims()), platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::x); - platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); + return this->AcquireMemoryWithReorder( + user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), + "@bias_mem_p", is_test); + } - // create a conv primitive descriptor and save it for usage in backward - std::shared_ptr conv_pd; - auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training; - if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, - fwd_prop_kind); + std::shared_ptr AcquireResidualMemory( + const framework::Tensor* residual_param) { + const T* residual_data = residual_param->data(); + auto user_residual_md = platform::MKLDNNMemDesc( + framework::vectorize(residual_param->dims()), + framework::ToMKLDNNDataType(residual_param->type()), + residual_param->format()); + + return this->AcquireMemoryFromPrimitive(user_residual_md, + to_void_cast(residual_data), + "@user_residual_data_mem_p"); + } + + std::shared_ptr AcquireDstMemoryWithResidual( + framework::Tensor* output, const framework::Tensor* residual_param) { + std::shared_ptr dst_memory_p; + if (residual_param->format() != + platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) { + auto residual_memory_p = this->AcquireResidualMemory(residual_param); + dst_memory_p = this->AcquireDstMemory(output); + this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst"); } else { - conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, boost::none, dst_md, strides, paddings, - mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, fwd_prop_kind); + // Changing ShareDataWith to TensorCopy results in performance drop + // on ResNet architectures + // (https://github.com/PaddlePaddle/Paddle/issues/22964) + output->ShareDataWith(*residual_param); + dst_memory_p = this->AcquireDstMemory(output); } + return dst_memory_p; + } +}; - // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = - handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); +template +class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL Conv must use CPUPlace")); + bool is_INT8 = + std::is_same::value || std::is_same::value; + if (!is_INT8) { + ComputeFP32(ctx); + } else { + std::string fuse_activation = ctx.Attr("fuse_activation"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto residual_param = ctx.Input("ResidualData"); + auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation, + fuse_residual_conn, residual_param); + if (dst_dt == mkldnn::memory::data_type::f32) { + ComputeINT8(ctx); + } else if (dst_dt == mkldnn::memory::data_type::u8) { + ComputeINT8(ctx); + } else if (dst_dt == mkldnn::memory::data_type::s8) { + ComputeINT8(ctx); + } + } + } - // create reorder primitive if the input format is not the preferred one - auto src_memory_p = - handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test); + void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); - std::shared_ptr dst_memory_p, user_residual_memory_p; + const bool is_test = ctx.Attr("is_test"); + const bool is_conv3d = ctx.Attr>("strides").size() == 3U; + const bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - auto residual_param_data = residual_param->data(); + const auto* input = ctx.Input("Input"); + const auto* filter = ctx.Input("Filter"); + const auto* bias = + ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); - PADDLE_ENFORCE_NE( - residual_param_data, nullptr, - platform::errors::InvalidArgument( - "Provide data if you want MKLDNN conv+elementwise_add fusion")); - PADDLE_ENFORCE_EQ( - output->dims(), residual_param->dims(), - platform::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, " - "but got output's dimension = %d and residual param's dimension " - "= %d .", - output->dims().size(), residual_param->dims().size())); - - if (residual_param->format() != handler.GetDstFormat()) { - auto output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); - auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); - auto residual_data_type = - paddle::framework::ToMKLDNNDataType(residual_param->type()); + ConvMKLDNNHandlerT handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias, + output, ctx.InputName("Input") + ctx.InputName("Filter")); - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_data_type, residual_param->format()); - user_residual_memory_p = handler.AcquireResidualDataMemory( - user_residual_md, to_void_cast(residual_param_data)); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), pipeline); - } else { - // Changing ShareDataWith to TensorCopy results in performance drop - // on ResNet architectures - // (https://github.com/PaddlePaddle/Paddle/issues/22964) - output->ShareDataWith(*residual_param); - auto output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - } - } else { - auto output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, ctx.Attr("groups"), is_conv3d, is_test); + + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + auto* residual_param = ctx.Input("ResidualData"); dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + handler.AcquireDstMemoryWithResidual(output, residual_param); + } else { + dst_memory_p = handler.AcquireDstMemory(output); } - auto conv_p = handler.AcquireConvolution(); + auto conv_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(mkldnn_engine); - if (bias) { - const T* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - auto user_bias_memory_p = - handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); - - auto bias_memory_p = - handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); + std::unordered_map args = { + {MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}; - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - - } else { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); + if (bias) { + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test); + args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } + + mkldnn::stream astream(mkldnn_engine); + conv_p->execute(astream, args); astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } + template void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); @@ -552,7 +616,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto weights_tz = paddle::framework::vectorize(filter->dims()); int g = std::max(groups, 1); - GetWeightsTz(weights_tz, g, is_conv3d); + GetWeightsTz(weights_tz, g); auto dst_tz = paddle::framework::vectorize(output->dims()); PADDLE_ENFORCE_EQ( @@ -598,9 +662,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw); /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ auto chosen_memory_format = MKLDNNMemoryFormat::any; std::vector bias_tz; @@ -866,7 +930,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto weights_tz = paddle::framework::vectorize(filter->dims()); int g = std::max(groups, 1); - GetWeightsTz(weights_tz, g, is_conv3d); + GetWeightsTz(weights_tz, g); auto dst_tz = paddle::framework::vectorize(output_grad->dims()); auto src_format = input->format(); @@ -879,7 +943,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key = platform::CreateKey( src_tz, ctx.InputName("Input") + ctx.InputName("Filter")); - const std::string key_conv_pd = key + "@conv_pd"; + const std::string key_conv_pd = key + "@forward_pd"; std::vector pipeline; // Create user memory descriptors diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f76df5ca26a..2d36ef2ce6a 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -210,6 +210,73 @@ class MKLDNNHandlerT { return mem_p; } + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p, + const std::string& suffix) { + const auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + + if (reorder_p == nullptr) { + reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + } + + mkldnn::stream astream(engine_); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + std::shared_ptr AcquireMemoryWithReorder( + const mkldnn::memory::desc& user_md, + const mkldnn::memory::desc& target_md, void* ptr, + const std::string& suffix, bool is_persistent = false) { + const auto target_key = key_ + suffix + "_target"; + const auto key_reorder_p = key_ + suffix + "reorder_p"; + const auto user_key = key_ + suffix + "_user"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(target_key)); + + if (target_memory_p == nullptr) { + auto user_memory_p = + std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + + mkldnn::stream astream(engine_); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + dev_ctx_.SetBlob(user_key, user_memory_p); + dev_ctx_.SetBlob(target_key, target_memory_p); + } else if (!is_persistent) { + mkldnn::stream astream(engine_); + + auto user_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); + user_memory_p->set_data_handle(ptr); + + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } + } + return target_memory_p; + } + std::shared_ptr AcquireMemory(const std::string& suffix) { const auto local_key = key_ + suffix; return std::static_pointer_cast( -- GitLab