From db6c00c4391813c0861927b25c3cf5c119c14ae3 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Fri, 5 Nov 2021 09:51:16 +0100
Subject: [PATCH] Disable pool&conv_transpose&quantize caching (#36695)

* - WIP
  - compilation fix
  - fix
  - fixes
  - fix
  - fix
  - fix again
  - fix
  - another fix
  - another compilation fix
  - fix
  - fix
  - fix
  - lint

* - pool2d partially stripped from cache
  - pool2d partially stripped of caching

* - compilation fix

* - compilation fix

* - Fix to UT of caching

* - Enabling test_conv3d_mkldnn

* - conv_transpose stripped of cache

* - compilation fix

* - fix

* - fix

* - compilation fix

* - fix

* Reverted disabling caching of conv2d

* - compilation fix

* - ut reverted
---
 .../fused/mkldnn/fusion_gru_mkldnn_op.cc      |   1 -
 .../fused/mkldnn/fusion_lstm_mkldnn_op.cc     |   1 -
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc  |   4 +-
 .../mkldnn/conv_transpose_mkldnn_op.cc        | 475 ++++++++++--------
 .../fluid/operators/mkldnn/pool_mkldnn_op.cc  | 393 +++++++--------
 .../operators/mkldnn/quantize_mkldnn_op.cc    | 105 ++--
 paddle/fluid/platform/mkldnn_reuse.h          |  19 +-
 .../unittests/mkldnn/test_conv3d_mkldnn_op.py |   2 +
 8 files changed, 496 insertions(+), 504 deletions(-)

diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 8e0627fc15..e1506e3708 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -21,7 +21,6 @@ namespace operators {
 using paddle::framework::LoDTensor;
 using paddle::framework::Tensor;
 using paddle::platform::CPUDeviceContext;
-using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
index a61a3de62f..edf541fde2 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
@@ -21,7 +21,6 @@ namespace operators {
 using paddle::framework::LoDTensor;
 using paddle::framework::Tensor;
 using paddle::platform::CPUDeviceContext;
-using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 2c03da252d..2f01b59313 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -565,7 +565,7 @@ class ConvMKLDNNHandlerT
       const auto target_mem_p = this->AcquireMemory(target_key_suffix);
       user_mem_p->set_data_handle(platform::to_void_cast<T>(in_mem_data));
       if (user_mem_p != target_mem_p) {
-        this->AcquireReorder(user_mem_p, target_mem_p, key_mem);
+        this->AcquireReorder(user_mem_p, target_mem_p);
       }
       return target_mem_p;
     }
@@ -643,7 +643,7 @@ class ConvMKLDNNHandlerT
             platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) {
       auto residual_memory_p = this->AcquireResidualMemory(residual_param);
       dst_memory_p = this->template AcquireDstMemory<T_out>(output);
-      this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst");
+      this->AcquireReorder(residual_memory_p, dst_memory_p);
     } else {
       // Changing ShareDataWith to TensorCopy results in performance drop
       // on ResNet architectures
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
index 8d43e9f0dc..b68c950aa9 100644
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -40,151 +40,144 @@ inline mkldnn::memory::dims GetWeightsTz(const Tensor* filter,
 template <typename T, typename K, typename T_out>
 class ConvTransposeMKLDNNHandlerT
-    : public platform::MKLDNNHandlerT<T, mkldnn::deconvolution_forward> {
+    : public platform::MKLDNNHandlerNoCachingT<T, mkldnn::deconvolution_forward> {
  public:
   ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx,
-                              const platform::MKLDNNDeviceContext& dev_ctx,
                               const mkldnn::engine mkldnn_engine,
-                              platform::Place cpu_place, const Tensor* input,
-                              const Tensor* filter, const Tensor* bias,
-                              Tensor* output, const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::deconvolution_forward>(
-            dev_ctx, mkldnn_engine, cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
-                                unique_name)) {
-    if (!this->isCached()) {
-      const bool is_test = ctx.Attr<bool>("is_test");
-      PADDLE_ENFORCE_EQ(is_test, true,
-                        platform::errors::InvalidArgument(
-                            "ConvTransposeMKLDNN works only for inference. "
-                            "The attribute \'is_test\' value should be set to "
-                            "True, but got is_test=False."));
-
-      PADDLE_ENFORCE_EQ(
-          input->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument(
-              "Got wrong layout = %d for Input tensor.", input->layout()));
-      PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Got wrong format for Input tensor. The input "
-                            "format is undefined."));
+                              const Tensor* input, const Tensor* filter,
+                              const Tensor* bias, Tensor* output)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::deconvolution_forward>(
+            mkldnn_engine, ctx.GetPlace()),
+        is_test_(ctx.Attr<bool>("is_test")) {
+    PADDLE_ENFORCE_EQ(is_test_, true,
+                      platform::errors::InvalidArgument(
+                          "ConvTransposeMKLDNN works only for inference. "
+                          "The attribute \'is_test\' value should be set to "
+                          "True, but got is_test=False."));
+
+    PADDLE_ENFORCE_EQ(
+        input->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument(
+            "Got wrong layout = %d for Input tensor.", input->layout()));
+    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Got wrong format for Input tensor. The input "
+                          "format is undefined."));
+
+    PADDLE_ENFORCE_EQ(
+        filter->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument(
+            "The filter tensor's layout should be %d, but got %d.",
+            DataLayout::kMKLDNN, filter->layout()));
+    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Got wrong formats for Filter tensor."));
+
+    PADDLE_ENFORCE_EQ(
+        input->dims().size(), 4,
+        platform::errors::InvalidArgument("Input must be with 4 dimensions, "
+                                          "i.e. NCHW, but got dimension = %d",
+                                          input->dims().size()));
+    PADDLE_ENFORCE_EQ(
+        filter->dims().size(), 4,
+        platform::errors::InvalidArgument("Filter must be with 4 dimensions, "
+                                          "i.e. OIHW, but got dimension = %d",
+                                          filter->dims().size()));
+    if (bias) {
       PADDLE_ENFORCE_EQ(
-          filter->layout(), DataLayout::kMKLDNN,
+          bias->layout(), DataLayout::kMKLDNN,
           platform::errors::InvalidArgument(
-              "The filter tensor's laytout should be %d, but got %d.",
-              DataLayout::kMKLDNN, filter->layout()));
-      PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef,
+              "The bias tensor's layout should be %d, but got %d.",
+              DataLayout::kMKLDNN, bias->layout()));
+      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef,
                         platform::errors::InvalidArgument(
-                            "Got wrong formats for Filter tensor."));
+                            "Got wrong format for Bias tensor."));
 
       PADDLE_ENFORCE_EQ(
-          input->dims().size(), 4,
-          platform::errors::InvalidArgument("Input must be with 4 dimensions, "
-                                            "i.e. NCHW. but got dimension =%d",
-                                            input->dims().size()));
-      PADDLE_ENFORCE_EQ(
-          filter->dims().size(), 4,
-          platform::errors::InvalidArgument("Filter must be with 4 dimensions, "
-                                            "i.e. OIHW, but got dimension =%d",
-                                            filter->dims().size()));
-
-      if (bias) {
-        PADDLE_ENFORCE_EQ(
-            bias->layout(), DataLayout::kMKLDNN,
-            platform::errors::InvalidArgument(
-                "The bias tensor's laytout should be %d, but got %d.",
-                DataLayout::kMKLDNN, bias->layout()));
-        PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef,
-                          platform::errors::InvalidArgument(
-                              "Got wrong format for Bias tensor."));
-
-        PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                          platform::errors::InvalidArgument(
-                              "Bias must only have 1 dimension, "
-                              "i.e. X, but got dimension = %d .",
-                              bias->dims().size()));
-      }
-
-      std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-      mkldnn::memory::dims strides(begin(strides_temp), end(strides_temp));
-
-      std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-      mkldnn::memory::dims paddings(begin(paddings_temp), end(paddings_temp));
-
-      std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
-      mkldnn::memory::dims dilations(begin(dilations_temp),
-                                     end(dilations_temp));
-
-      int groups = ctx.Attr<int>("groups");
-      std::string padding_algorithm =
-          ctx.Attr<std::string>("padding_algorithm");
+          bias->dims().size(), 1,
+          platform::errors::InvalidArgument("Bias must only have 1 dimension, "
+                                            "i.e. X, but got dimension = %d.",
+                                            bias->dims().size()));
+    }
 
-      PADDLE_ENFORCE_EQ(
-          strides.size(), 2,
-          platform::errors::Unimplemented(
-              "Now we only support 2d oneDNN convolution transpose op"));
-
-      const auto& input_dims = input->dims();
-      const auto data_dims =
-          framework::slice_ddim(input_dims, 2, input_dims.size());
-      const auto& filter_dims = filter->dims();
-      const auto filter_data_dims =
-          framework::slice_ddim(filter_dims, 2, filter_dims.size());
-
-      const auto ksize = framework::vectorize(filter_data_dims);
-
-      UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                               data_dims, strides, ksize);
-
-      std::transform(dilations.begin(), dilations.end(), dilations.begin(),
-                     [](int64_t i) { return i - 1; });
-
-      const auto src_tz = framework::vectorize(input->dims());
-      const auto weights_tz = GetWeightsTz(filter, groups);
-      const auto dst_tz = framework::vectorize(output->dims());
-      const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
-
-      /* create memory descriptor for convolution without specified format
-       * ('any') which lets a primitive (convolution in this case) choose
-       * the memory format preferred for best performance
-       */
-      const auto chosen_memory_format = MKLDNNMemoryFormat::any;
-      const std::string fuse_activation =
-          ctx.Attr<std::string>("fuse_activation");
-      const float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-      const float fuse_beta = ctx.Attr<float>("fuse_beta");
-
-      auto data_type = mkldnn::memory::data_type::f32;
-      if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
-          std::is_same<T, platform::bfloat16>::value)
-        data_type = mkldnn::memory::data_type::bf16;
-
-      const auto src_md =
-          platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format);
-      const auto weights_md =
-          platform::MKLDNNMemDesc(weights_tz, data_type, chosen_memory_format);
-      const auto dst_md = platform::MKLDNNMemDesc(
-          dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
-
-      const mkldnn::primitive_attr conv_trans_attr =
-          CreatePostOps(fuse_activation, fuse_alpha, fuse_beta);
-      auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
-                                   : mkldnn::prop_kind::forward_training;
-      if (bias) {
-        std::vector<int64_t> bias_tz = framework::vectorize(bias->dims());
-        const auto bias_md =
-            platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x);
-        this->AcquireForwardPrimitiveDescriptor(
-            conv_trans_attr, fwd_prop_kind,
-            dnnl::algorithm::deconvolution_direct, src_md, weights_md, bias_md,
-            dst_md, strides, dilations, mkldnn_paddings[0], mkldnn_paddings[1]);
-      } else {
-        this->AcquireForwardPrimitiveDescriptor(
-            conv_trans_attr, fwd_prop_kind,
-            dnnl::algorithm::deconvolution_direct, src_md, weights_md, dst_md,
-            strides, dilations, mkldnn_paddings[0], mkldnn_paddings[1]);
-      }
+    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
+    mkldnn::memory::dims strides(begin(strides_temp), end(strides_temp));
+
+    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
+    mkldnn::memory::dims paddings(begin(paddings_temp), end(paddings_temp));
+
+    std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
+    mkldnn::memory::dims dilations(begin(dilations_temp), end(dilations_temp));
+
+    int groups = ctx.Attr<int>("groups");
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+
+    PADDLE_ENFORCE_EQ(
+        strides.size(), 2,
+        platform::errors::Unimplemented(
+            "Now we only support 2d oneDNN convolution transpose op"));
+
+    const auto& input_dims = input->dims();
+    const auto data_dims =
+        framework::slice_ddim(input_dims, 2, input_dims.size());
+    const auto& filter_dims = filter->dims();
+    const auto filter_data_dims =
+        framework::slice_ddim(filter_dims, 2, filter_dims.size());
+
+    const auto ksize = framework::vectorize(filter_data_dims);
+
+    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                             data_dims, strides, ksize);
+
+    std::transform(dilations.begin(), dilations.end(), dilations.begin(),
+                   [](int64_t i) { return i - 1; });
+
+    const auto src_tz = framework::vectorize(input->dims());
+    const auto weights_tz = GetWeightsTz(filter, groups);
+    const auto dst_tz = framework::vectorize(output->dims());
+    const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
+
+    /* create memory descriptor for convolution without specified format
+     * ('any') which lets a primitive (convolution in this case) choose
+     * the memory format preferred for best performance
+     */
+    const auto chosen_memory_format = MKLDNNMemoryFormat::any;
+    const std::string fuse_activation =
+        ctx.Attr<std::string>("fuse_activation");
+    const float fuse_alpha = ctx.Attr<float>("fuse_alpha");
+    const float fuse_beta = ctx.Attr<float>("fuse_beta");
+
+    auto data_type = mkldnn::memory::data_type::f32;
+    if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
+        std::is_same<T, platform::bfloat16>::value)
+      data_type = mkldnn::memory::data_type::bf16;
+
+    const auto src_md =
+        platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format);
+    const auto weights_md =
+        platform::MKLDNNMemDesc(weights_tz, data_type, chosen_memory_format);
+    const auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
+
+    const mkldnn::primitive_attr conv_trans_attr =
+        CreatePostOps(fuse_activation, fuse_alpha, fuse_beta);
+    auto fwd_prop_kind = is_test_ ? mkldnn::prop_kind::forward_inference
+                                  : mkldnn::prop_kind::forward_training;
+    if (bias) {
+      std::vector<int64_t> bias_tz = framework::vectorize(bias->dims());
+      const auto bias_md =
+          platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x);
+      this->AcquireForwardPrimitiveDescriptor(
+          conv_trans_attr, fwd_prop_kind, dnnl::algorithm::deconvolution_direct,
+          src_md, weights_md, bias_md, dst_md, strides, dilations,
+          mkldnn_paddings[0], mkldnn_paddings[1]);
+    } else {
+      this->AcquireForwardPrimitiveDescriptor(
+          conv_trans_attr, fwd_prop_kind, dnnl::algorithm::deconvolution_direct,
+          src_md, weights_md, dst_md, strides, dilations, mkldnn_paddings[0],
+          mkldnn_paddings[1]);
     }
   }
@@ -217,86 +210,140 @@ class ConvTransposeMKLDNNHandlerT
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryWithReorder(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
-    const std::string user_key_suffix{"@src_mem_p_user"};
-    auto user_src_mem_p = this->AcquireMemory(user_key_suffix);
-    if (!user_src_mem_p) {
-      auto user_src_md = platform::MKLDNNMemDesc(
-          framework::vectorize(input->dims()), platform::MKLDNNGetDataType<T>(),
-          input->format());
-      return this->AcquireMemoryWithReorder(
-          user_src_md, this->fwd_pd_->src_desc(),
-          platform::to_void_cast<T>(input_data), "@src_mem_p");
-    } else {
-      const std::string target_key_suffix{"@src_mem_p_target"};
-      const auto target_src_mem_p = this->AcquireMemory(target_key_suffix);
-      user_src_mem_p->set_data_handle(platform::to_void_cast<T>(input_data));
-      if (user_src_mem_p != target_src_mem_p) {
-        this->AcquireReorder(user_src_mem_p, target_src_mem_p, "@src_mem_p");
-      }
-      return target_src_mem_p;
-    }
+    auto user_src_md = platform::MKLDNNMemDesc(
+        framework::vectorize(input->dims()), platform::MKLDNNGetDataType<T>(),
+        input->format());
+    return platform::MKLDNNHandlerNoCachingT<T, mkldnn::deconvolution_forward>::
+        AcquireMemoryWithReorder(user_src_md, this->fwd_pd_->src_desc(),
+                                 platform::to_void_cast<T>(input_data));
   }
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryWithReorder(
-      const framework::Tensor* filter, const int& groups, const bool& is_test) {
-    // This is workaround to make execution faster, delete
-    // if statement after including md inside Tensor
-    auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target");
-    if (is_test && weights_mem_p) {
-      return weights_mem_p;
-    } else {
-      const K* filter_data = filter->data<K>();
-      auto weights_tz = GetWeightsTz(filter, groups);
-      int g = std::max(groups, 1);
-
-      auto user_src_md = platform::MKLDNNMemDesc(
-          weights_tz, platform::MKLDNNGetDataType<K>(),
-          (g == 1) ? filter->format() : MKLDNNMemoryFormat::goihw);
-
-      auto iohw_weights_tz = framework::vectorize(filter->dims());
-      // Custom Reorder from IOHW to OIHW
-      auto iohw2oihw_reorder =
-          [&iohw_weights_tz](const K* filter_data) -> std::shared_ptr<K> {
-        int o = iohw_weights_tz[1];
-        int c = iohw_weights_tz[0];
-        int h = iohw_weights_tz[2];
-        int w = iohw_weights_tz[3];
-        std::shared_ptr<K> reordered_filter_data(new K[o * c * h * w](),
-                                                 std::default_delete<K[]>());
-        for (int i = 0; i < c; ++i) {
-          for (int j = 0; j < o; ++j) {
-            int in_offset = j * h * w + i * o * h * w;
-            int out_offset = j * c * h * w + i * h * w;
-            std::memcpy(&(reordered_filter_data.get())[out_offset],
-                        &filter_data[in_offset], h * w * sizeof(K));
-          }
+      const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key,
+      const framework::Tensor* filter, const int& groups) {
+    const K* filter_data = filter->data<K>();
+    auto weights_tz = GetWeightsTz(filter, groups);
+    int g = std::max(groups, 1);
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        weights_tz, platform::MKLDNNGetDataType<K>(),
+        (g == 1) ? filter->format() : MKLDNNMemoryFormat::goihw);
+
+    auto iohw_weights_tz = framework::vectorize(filter->dims());
+    // Custom Reorder from IOHW to OIHW
+    auto iohw2oihw_reorder =
+        [&iohw_weights_tz](const K* filter_data) -> std::shared_ptr<K> {
+      int o = iohw_weights_tz[1];
+      int c = iohw_weights_tz[0];
+      int h = iohw_weights_tz[2];
+      int w = iohw_weights_tz[3];
+      std::shared_ptr<K> reordered_filter_data(new K[o * c * h * w](),
+                                               std::default_delete<K[]>());
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < o; ++j) {
+          int in_offset = j * h * w + i * o * h * w;
+          int out_offset = j * c * h * w + i * h * w;
+          std::memcpy(&(reordered_filter_data.get())[out_offset],
+                      &filter_data[in_offset], h * w * sizeof(K));
         }
+      }
+
+      return reordered_filter_data;
+    };
 
-        return reordered_filter_data;
-      };
+    return this->template AcquireMemoryWithReorder<K>(
+        dev_ctx, user_src_md, this->fwd_pd_->weights_desc(),
+        platform::to_void_cast<K>(filter_data), key, "@weights_mem_p", is_test_,
+        iohw2oihw_reorder);
+  }
 
-      return this->template AcquireMemoryWithReorder<K>(
-          user_src_md, this->fwd_pd_->weights_desc(),
-          platform::to_void_cast<K>(filter_data), "@weights_mem_p", is_test,
-          iohw2oihw_reorder);
+  template <typename F = T>
+  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
+      const platform::MKLDNNDeviceContext& dev_ctx,
+      const mkldnn::memory::desc& user_md,
+      const mkldnn::memory::desc& target_md, void* ptr, const std::string& key,
+      const std::string& suffix, bool is_persistent = false,
+      std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {},
+      const std::vector<float>& scale_data = {1.0f}, int mask = 0) {
+    const auto target_key = key + suffix + "_target";
+    const auto key_reorder_p = key + suffix + "reorder_p";
+    const auto user_key = key + suffix + "_user";
+
+    auto target_memory_p =
+        std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(target_key));
+
+    if (target_memory_p == nullptr) {
+      if (custom_reorder_func) {
+        auto reordered_data =
+            custom_reorder_func(reinterpret_cast<const F*>(ptr));
+        dev_ctx.SetBlob(key_reorder_p + "-custom_reorder", reordered_data);
+        ptr = reinterpret_cast<void*>(reordered_data.get());
+      }
+      auto user_memory_p =
+          std::make_shared<dnnl::memory>(user_md, this->engine_, ptr);
+      if (user_md != target_md) {
+        target_memory_p =
+            std::make_shared<dnnl::memory>(target_md, this->engine_);
+        dnnl::reorder::primitive_desc reorder_pdesc;
+        if (platform::is_int8<T>()) {
+          dnnl::primitive_attr attr;
+          attr.set_output_scales(mask, scale_data);
+          reorder_pdesc = dnnl::reorder::primitive_desc(*user_memory_p,
+                                                        *target_memory_p, attr);
+        } else {
+          reorder_pdesc =
+              dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p);
+        }
+        auto reorder_p = std::make_shared<dnnl::reorder>(reorder_pdesc);
+        dev_ctx.SetBlob(key_reorder_p, reorder_p);
+
+        auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+        platform::RecordEvent record_reorder("int_reorder",
+                                             platform::EventRole::kUniqueOp);
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
+      } else {
+        target_memory_p = user_memory_p;
+      }
+      dev_ctx.SetBlob(user_key, user_memory_p);
+      dev_ctx.SetBlob(target_key, target_memory_p);
+    } else if (!is_persistent) {
+      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+      auto user_memory_p =
+          std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(user_key));
+      user_memory_p->set_data_handle(ptr);
+
+      // TODO(jczaja): Here we detect if a reorder is cached, which means it
+      // is needed; this should be changed to get rid of keys
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        platform::RecordEvent record_reorder("int_reorder",
+                                             platform::EventRole::kUniqueOp);
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
+      }
+    }
+    return target_memory_p;
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
-      const framework::Tensor* bias, const bool& is_test) {
-    auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target");
-    if (is_test && bias_mem_p) {
-      return bias_mem_p;
-    } else {
-      const K* bias_data = bias->data<K>();
-      auto user_bias_md = platform::MKLDNNMemDesc(
-          framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
-          MKLDNNMemoryFormat::x);
-      return this->AcquireMemoryWithReorder(
-          user_bias_md, this->fwd_pd_->bias_desc(),
-          platform::to_void_cast<K>(bias_data), "@bias_mem_p", is_test);
-    }
+      const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key,
+      const framework::Tensor* bias) {
+    const K* bias_data = bias->data<K>();
+    auto user_bias_md = platform::MKLDNNMemDesc(
+        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
+        MKLDNNMemoryFormat::x);
+    return this->AcquireMemoryWithReorder(
+        dev_ctx, user_bias_md, this->fwd_pd_->bias_desc(),
+        platform::to_void_cast<K>(bias_data), key, "@bias_mem_p", is_test_);
   }
+
+ private:
+  const bool is_test_;
 };
 
 template <typename T, typename K>
@@ -325,22 +372,21 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
         ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
 
-    const bool is_test = ctx.Attr<bool>("is_test");
-
     const auto* input = ctx.Input<Tensor>("Input");
     const auto* filter = ctx.Input<Tensor>("Filter");
     const auto* bias =
         ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    const std::string unique_name = ctx.InputName("Input") +
-                                    ctx.InputName("Filter") +
-                                    (bias ? ctx.InputName("Bias") : "");
-    ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(
-        ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias,
-        output, unique_name);
+    ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(ctx, mkldnn_engine, input,
+                                                     filter, bias, output);
     auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
+    // Caching Key for weights is needed
+    std::string key = platform::CreateKey(dev_ctx, ctx.InputName("Input"),
+                                          ctx.InputName("Filter"),
+                                          (bias ? ctx.InputName("Bias") : ""));
+    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
     auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
-        filter, ctx.Attr<int>("groups"), is_test);
+        dev_ctx, key, filter, ctx.Attr<int>("groups"));
 
     std::shared_ptr<mkldnn::memory> dst_memory_p =
         handler.template AcquireDstMemory<T_out>(output);
@@ -352,7 +398,8 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
         {MKLDNN_ARG_DST, *dst_memory_p}};
 
     if (bias) {
-      auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test);
+      auto bias_memory_p =
+          handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias);
       args.insert({MKLDNN_ARG_BIAS, *bias_memory_p});
     }
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index 920ec97a76..9e437fb15e 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -30,234 +30,220 @@ using platform::to_void_cast;
 
 template <typename T>
 class PoolingMKLDNNHandler
-    : public platform::MKLDNNHandlerT<T, mkldnn::pooling_forward, mkldnn::pooling_backward> {
+    : public platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward, mkldnn::pooling_backward> {
  public:
   PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       platform::Place cpu_place, const Tensor* input,
-                       Tensor* output, const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward, mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
-                                framework::ToMKLDNNDataType(input->type()),
-                                unique_name)) {
-    if (!this->isCached()) {
-      PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                        platform::errors::InvalidArgument(
-                            "Wrong layout set for Input tensor."));
-      PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Wrong format set for Input tensor."));
-
-      const std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-
-      std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
-      std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
-
-      std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-      std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
-
-      std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-      std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
-
-      const bool global_pooling = ctx.Attr<bool>("global_pooling");
-      const std::string padding_algorithm =
-          ctx.Attr<std::string>("padding_algorithm");
-
-      // Only 2D pooling is supported now
-      PADDLE_ENFORCE_EQ(
-          ksize.size(), 2,
-          platform::errors::InvalidArgument(
-              "The ksize must be 2D, i.e. 2D pooling, but received %dD.",
-              ksize.size()));
-      PADDLE_ENFORCE_EQ(
-          pooling_type == "max" || pooling_type == "avg", true,
-          platform::errors::InvalidArgument(
-              "The pooling_type must be 'max' or 'avg', but received %s.",
-              pooling_type));
-      PADDLE_ENFORCE_EQ(
-          input->dims().size(), 4,
-          platform::errors::InvalidArgument(
-              "Input dim must be with 4, i.e. NCHW, but received %d.",
-              input->dims().size()));
-
-      const auto input_dims = input->dims();
-      framework::DDim data_dims =
-          framework::slice_ddim(input_dims, 2, input_dims.size());
-
-      if (global_pooling) {
-        operators::UpdateKsize(&ksize, data_dims);
-      }
-
-      operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
-                               data_dims, strides, ksize);
+                       const mkldnn::engine mkldnn_engine, const Tensor* input,
+                       Tensor* output)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward, mkldnn::pooling_backward>(
+            mkldnn_engine, ctx.GetPlace()) {
+    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
+                      platform::errors::InvalidArgument(
+                          "Wrong layout set for Input tensor."));
+    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Input tensor."));
+
+    const std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+
+    std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
+
+    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
+
+    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
+
+    const bool global_pooling = ctx.Attr<bool>("global_pooling");
+    const std::string padding_algorithm =
+        ctx.Attr<std::string>("padding_algorithm");
+
+    // Only 2D pooling is supported now
+    PADDLE_ENFORCE_EQ(
+        ksize.size(), 2,
+        platform::errors::InvalidArgument(
+            "The ksize must be 2D, i.e. 2D pooling, but received %dD.",
+            ksize.size()));
+    PADDLE_ENFORCE_EQ(
+        pooling_type == "max" || pooling_type == "avg", true,
+        platform::errors::InvalidArgument(
+            "The pooling_type must be 'max' or 'avg', but received %s.",
+            pooling_type));
+    PADDLE_ENFORCE_EQ(
+        input->dims().size(), 4,
+        platform::errors::InvalidArgument(
+            "Input dim must be with 4, i.e. NCHW, but received %d.",
+            input->dims().size()));
+
+    const auto input_dims = input->dims();
+    framework::DDim data_dims =
+        framework::slice_ddim(input_dims, 2, input_dims.size());
+
+    if (global_pooling) {
+      operators::UpdateKsize(&ksize, data_dims);
+    }
 
-      const auto src_tz = paddle::framework::vectorize(input->dims());
-      const auto dst_tz = paddle::framework::vectorize(output->dims());
+    operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
+                             data_dims, strides, ksize);
 
-      const auto is_test = ctx.Attr<bool>("is_test");
+    const auto src_tz = paddle::framework::vectorize(input->dims());
+    const auto dst_tz = paddle::framework::vectorize(output->dims());
 
-      const auto dt = framework::ToMKLDNNDataType(input->type());
+    const auto is_test = ctx.Attr<bool>("is_test");
 
-      const auto exclude_padding = ctx.Attr<bool>("exclusive");
+    const auto dt = framework::ToMKLDNNDataType(input->type());
 
-      const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format());
-      /* create memory descriptor for pooling without specified format
-       * ('any') which lets a primitive (pooling in this case) choose
-       * the memory format preferred for best performance
-       */
+    const auto exclude_padding = ctx.Attr<bool>("exclusive");
 
-      const auto dst_md =
-          platform::MKLDNNMemDesc(dst_tz, dt, MKLDNNMemoryFormat::any);
+    const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format());
+    /* create memory descriptor for pooling without specified format
+     * ('any') which lets a primitive (pooling in this case) choose
+     * the memory format preferred for best performance
+     */
 
-      auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
+    const auto dst_md =
+        platform::MKLDNNMemDesc(dst_tz, dt, MKLDNNMemoryFormat::any);
 
-      const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
+    auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
 
-      if (ceil_mode) {
-        CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
-                          mkldnn_paddings[1]);
-      }
+    const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
 
-      ComputeAdaptivePoolParameters(ctx, src_tz, &ksize, &strides);
-
-      this->AcquireForwardPrimitiveDescriptor(
-          is_test ? mkldnn::prop_kind::forward_inference
-                  : mkldnn::prop_kind::forward_training,
-          pooling_type == "max"
-              ? mkldnn::algorithm::pooling_max
-              : (exclude_padding
-                     ? mkldnn::algorithm::pooling_avg_exclude_padding
-                     : mkldnn::algorithm::pooling_avg_include_padding),
-          src_md, dst_md, strides, ksize, mkldnn_paddings[0],
-          mkldnn_paddings[1]);
+    if (ceil_mode) {
+      CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
+                        mkldnn_paddings[1]);
     }
+
+    ComputeAdaptivePoolParameters(ctx, src_tz, &ksize, &strides);
+
+    this->AcquireForwardPrimitiveDescriptor(
+        is_test ? mkldnn::prop_kind::forward_inference
+                : mkldnn::prop_kind::forward_training,
+        pooling_type == "max"
+            ? mkldnn::algorithm::pooling_max
+            : (exclude_padding
+                   ? mkldnn::algorithm::pooling_avg_exclude_padding
+                   : mkldnn::algorithm::pooling_avg_include_padding),
+        src_md, dst_md, strides, ksize, mkldnn_paddings[0], mkldnn_paddings[1]);
   }
 
   PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       platform::Place cpu_place, const Tensor* in_x,
-                       const Tensor* out_grad, Tensor* in_x_grad,
-                       const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward, mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()),
-                                framework::ToMKLDNNDataType(in_x->type()),
-                                unique_name)) {
-    if (!this->isBwdCached()) {
-      PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN,
-                        platform::errors::InvalidArgument(
-                            "Wrong layout set for Input tensor"));
-      PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Wrong format set for Input tensor"));
-
-      PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
-                        platform::errors::InvalidArgument(
-                            "Wrong layout set for Input output_grad tensor"));
-      PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Wrong format set for Input output_grad tensor"));
-
-      PADDLE_ENFORCE_EQ(
-          ctx.Attr<bool>("is_test"), false,
-          platform::errors::InvalidArgument(
-              "is_test attribute should be set to False in training phase."));
-
-      std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-
-      std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
-      std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
-
-      std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-      std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
-
-      std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-      std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
-
-      bool global_pooling = ctx.Attr<bool>("global_pooling");
-      std::string padding_algorithm =
-          ctx.Attr<std::string>("padding_algorithm");
-
-      auto in_x_dims = in_x->dims();
-      framework::DDim data_dims =
-          framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
-
-      if (global_pooling) {
-        operators::UpdateKsize(&ksize, data_dims);
-      }
+                       const mkldnn::engine mkldnn_engine, const Tensor* in_x,
+                       const Tensor* out_grad, Tensor* in_x_grad)
+
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward, mkldnn::pooling_backward>(
+            mkldnn_engine, ctx.GetPlace()) {
+    PADDLE_ENFORCE_EQ(
+        in_x->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for Input tensor"));
+    PADDLE_ENFORCE_NE(
+        in_x->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for Input tensor"));
+
+    PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
+                      platform::errors::InvalidArgument(
+                          "Wrong layout set for Input output_grad tensor"));
+    PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Input output_grad tensor"));
+
+    PADDLE_ENFORCE_EQ(
+        ctx.Attr<bool>("is_test"), false,
+        platform::errors::InvalidArgument(
+            "is_test attribute should be set to False in training phase."));
+
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+
+    std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
+
+    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
+
+    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
+
+    bool global_pooling = ctx.Attr<bool>("global_pooling");
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+
+    auto in_x_dims = in_x->dims();
+    framework::DDim data_dims =
+        framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
+
+    if (global_pooling) {
+      operators::UpdateKsize(&ksize, data_dims);
+    }
 
-      operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
-                               data_dims, strides, ksize);
-
-      auto src_tz = paddle::framework::vectorize(in_x->dims());
-      auto diff_src_tz =
-          paddle::framework::vectorize<int64_t>(in_x_grad->dims());
-      auto diff_dst_tz =
-          paddle::framework::vectorize<int64_t>(out_grad->dims());
-
-      const auto dt = framework::ToMKLDNNDataType(in_x->type());
-      auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format());
-      auto dst_md =
-          mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any);
-      auto diff_dst_md = mkldnn::memory::desc(
-          diff_dst_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
-      auto diff_src_md =
-          mkldnn::memory::desc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
-                               MKLDNNMemoryFormat::any);
-
-      auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
-      const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
-
-      if (ceil_mode) {
-        CorrectOutputSize(src_tz, diff_dst_tz, ksize, paddings, strides,
-                          mkldnn_paddings[1]);
-      }
-      ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides);
-
-      const auto exclude_padding = ctx.Attr<bool>("exclusive");
-
-      this->AcquireForwardPrimitiveDescriptor(
-          mkldnn::prop_kind::forward_training,
-          pooling_type == "max"
-              ? mkldnn::algorithm::pooling_max
-              : (exclude_padding
-                     ? mkldnn::algorithm::pooling_avg_exclude_padding
-                     : mkldnn::algorithm::pooling_avg_include_padding),
-          src_md, dst_md, strides, ksize, mkldnn_paddings[0],
-          mkldnn_paddings[1]);
-
-      this->AcquireBackwardPrimitiveDescriptor(
-          pooling_type == "max"
-              ? mkldnn::algorithm::pooling_max
-              : (exclude_padding
-                     ? mkldnn::algorithm::pooling_avg_exclude_padding
-                     : mkldnn::algorithm::pooling_avg_include_padding),
-          diff_src_md, diff_dst_md, strides, ksize, mkldnn_paddings[0],
-          mkldnn_paddings[1]);
+    operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
+                             data_dims, strides, ksize);
+
+    auto src_tz = paddle::framework::vectorize(in_x->dims());
+    auto diff_src_tz = paddle::framework::vectorize(in_x_grad->dims());
+    auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims());
+
+    const auto dt = framework::ToMKLDNNDataType(in_x->type());
+    auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format());
+    auto dst_md =
+        mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any);
+    auto diff_dst_md = mkldnn::memory::desc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
+    auto diff_src_md = mkldnn::memory::desc(
+        diff_src_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
+
+    auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
+    const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
+
+    if (ceil_mode) {
+      CorrectOutputSize(src_tz, diff_dst_tz, ksize, paddings, strides,
+                        mkldnn_paddings[1]);
     }
+    ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides);
+
+    const auto exclude_padding = ctx.Attr<bool>("exclusive");
+
+    this->AcquireForwardPrimitiveDescriptor(
+        mkldnn::prop_kind::forward_training,
+        pooling_type == "max"
+            ? mkldnn::algorithm::pooling_max
+            : (exclude_padding
+                   ? mkldnn::algorithm::pooling_avg_exclude_padding
+                   : mkldnn::algorithm::pooling_avg_include_padding),
+        src_md, dst_md, strides, ksize, mkldnn_paddings[0], mkldnn_paddings[1]);
+
+    this->AcquireBackwardPrimitiveDescriptor(
+        pooling_type == "max"
+            ? mkldnn::algorithm::pooling_max
+            : (exclude_padding
+                   ? mkldnn::algorithm::pooling_avg_exclude_padding
+                   : mkldnn::algorithm::pooling_avg_include_padding),
+        diff_src_md, diff_dst_md, strides, ksize, mkldnn_paddings[0],
+        mkldnn_paddings[1]);
   }
 
-  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(void) {
+  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(
+      const platform::MKLDNNDeviceContext& dev_ctx,
+      const std::string& unique_name) {
     mkldnn::memory::desc workspace_md = this->fwd_pd_->workspace_desc();
-    // Pooling PD has to be passed to Grad op that
+    // Pooling Workspace has to be passed to Grad op that
     // may be executed by a different thread, hence
     // for that one we use a key that does not contain TID
-    auto local_key = this->key_common_ + "@workspace";
+    std::string workspace_key =
+        platform::CreateKey(dev_ctx, workspace_md.dims(),
+                            workspace_md.data_type(), unique_name, "@wrk");
     auto mem_p = std::static_pointer_cast<mkldnn::memory>(
-        this->dev_ctx_.GetBlob(local_key));
+        dev_ctx.GetBlob(workspace_key));
     if (mem_p == nullptr) {
       static std::mutex acquire_barrier;
       std::lock_guard<std::mutex> block_threads_until_finish_this_job(
           acquire_barrier);
       mem_p = std::static_pointer_cast<mkldnn::memory>(
-          this->dev_ctx_.GetBlob(local_key));
+          dev_ctx.GetBlob(workspace_key));
       if (mem_p == nullptr) {
         mem_p = std::make_shared<mkldnn::memory>(workspace_md, this->engine_);
-        this->dev_ctx_.SetBlob(local_key, mem_p);
+        dev_ctx.SetBlob(workspace_key, mem_p);
      }
     }
     return mem_p;
@@ -319,8 +305,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* input = ctx.Input<Tensor>("X");
     Tensor* output = ctx.Output<Tensor>("Out");
 
-    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), input, output,
-                                    ctx.OutputName("Out"));
+    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx.GetEngine(), input, output);
 
     auto src_memory = handler.AcquireSrcMemory(input);
     auto dst_memory = handler.AcquireDstMemory(output);
@@ -331,7 +316,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if ((ctx.Attr<bool>("is_test") == false) &&
         (ctx.Attr<std::string>("pooling_type") == "max")) {
       // Training
-      auto workspace_memory = handler.AcquireWorkspaceMemory();
+      auto workspace_memory =
+          handler.AcquireWorkspaceMemory(dev_ctx, ctx.OutputName("Out"));
       pool_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory},
                                 {MKLDNN_ARG_DST, *dst_memory},
                                 {MKLDNN_ARG_WORKSPACE, *workspace_memory}});
@@ -361,8 +347,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
 
-    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), in_x,
-                                    out_grad, in_x_grad, ctx.InputName("Out"));
+    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx.GetEngine(), in_x, out_grad,
+                                    in_x_grad);
 
     auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad);
     auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad);
@@ -372,7 +358,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (ctx.Attr<std::string>("pooling_type") == "max") {
       // Max - pooling needs Workspace
-      auto workspace_memory = handler.AcquireWorkspaceMemory();
+      auto workspace_memory =
+          handler.AcquireWorkspaceMemory(dev_ctx, ctx.InputName("Out"));
       pool_bwd_p->execute(astream, {{MKLDNN_ARG_DIFF_SRC, *diff_src_memory},
                                     {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
                                     {MKLDNN_ARG_WORKSPACE, *workspace_memory}});
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 819c0d1550..815af4eaaf 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -64,81 +64,46 @@ class QuantOpKernel : public framework::OpKernel<T> {
     bool is_negative_input = ctx.Attr<bool>("is_negative_input");
     bool bfloat16 = ctx.Attr<bool>("bfloat16");
 
-    std::string key =
-        platform::CreateKey(dev_ctx, src_tz, scale_data, scale_shift,
-                            is_negative_input, ctx.OutputName("Output"));
-    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
-
-    const std::string key_prim = key + "@r";
-    const std::string key_src_mem = key + "@s";
-    const std::string key_dst_mem = key + "@d";
-
+    // TODO(jczaja): Refactor with Acquire API
     std::shared_ptr<mkldnn::memory> src_memory;
     std::shared_ptr<mkldnn::memory> dst_memory;
     std::shared_ptr<reorder> reorder_p;
-    reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
-
-    if (reorder_p == nullptr) {
-      std::string out_layout = ctx.Attr<std::string>("output_format");
-      MKLDNNMemoryFormat out_format =
-          platform::data_format_to_memory_format(out_layout);
-      mkldnn::primitive_attr attri;
-      int mask = 0;
-      attri.set_output_scales(mask, {scale_data});
-
-      if (with_shift) {
-        mkldnn::post_ops post_operations;
-        post_operations.append_sum();
-        attri.set_post_ops(post_operations);
-        uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
-        // memset casts scale_shift to unsigned char (uint8_t) internally
-        std::memset(output_data, scale_shift, output->numel());
-      }
-
-      auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
-                                            input->format());
-      src_memory = std::make_shared<mkldnn::memory>(
-          src_md, engine, to_void_cast<T>(input_data));
-
-      std::shared_ptr<mkldnn::memory::desc> dst_md;
-      if (bfloat16) {
-        platform::SetDstMemoryQuantized<paddle::platform::bfloat16>(
-            ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
-      } else if (is_negative_input && !with_shift) {
-        platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
-                                                dst_md, dst_memory, out_format);
-      } else {
-        platform::SetDstMemoryQuantized<uint8_t>(
-            ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
-      }
-      auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
-          new reorder::primitive_desc(*src_memory, *dst_memory, attri));
-      reorder_p = std::shared_ptr<reorder>(new reorder(*reorder_pd));
-
-      dev_ctx.SetBlob(key_prim, reorder_p);
-      dev_ctx.SetBlob(key_src_mem, src_memory);
-      dev_ctx.SetBlob(key_dst_mem, dst_memory);
+
+    std::string out_layout = ctx.Attr<std::string>("output_format");
+    MKLDNNMemoryFormat out_format =
+        platform::data_format_to_memory_format(out_layout);
+    mkldnn::primitive_attr attri;
+    int mask = 0;
+    attri.set_output_scales(mask, {scale_data});
+
+    if (with_shift) {
+      mkldnn::post_ops post_operations;
+      post_operations.append_sum();
+      attri.set_post_ops(post_operations);
+      uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
+      // memset casts scale_shift to unsigned char (uint8_t) internally
+      std::memset(output_data, scale_shift, output->numel());
+    }
+
+    auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
+                                          input->format());
+    src_memory = std::make_shared<mkldnn::memory>(src_md, engine,
+                                                  to_void_cast<T>(input_data));
+
+    std::shared_ptr<mkldnn::memory::desc> dst_md;
+    if (bfloat16) {
+      platform::SetDstMemoryQuantized<paddle::platform::bfloat16>(
+          ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
+    } else if (is_negative_input && !with_shift) {
+      platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
+                                              dst_md, dst_memory, out_format);
     } else {
-      src_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_src_mem));
-      src_memory->set_data_handle(to_void_cast<T>(input_data));
-
-      dst_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_dst_mem));
-      auto place = ctx.GetPlace();
-
-      if (bfloat16) {
-        dst_memory->set_data_handle(
-            output->mutable_data<paddle::platform::bfloat16>(place));
-      } else if (with_shift || !is_negative_input) {
-        uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
-        if (with_shift) std::memset(output_data, scale_shift, output->numel());
-        dst_memory->set_data_handle(output_data);
-      } else {
-        dst_memory->set_data_handle(
-            output->mutable_data<int8_t>(ctx.GetPlace()));
-      }
+      platform::SetDstMemoryQuantized<uint8_t>(ctx, output, dst_tz, engine,
+                                               dst_md, dst_memory, out_format);
     }
+    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
+        new reorder::primitive_desc(*src_memory, *dst_memory, attri));
+    reorder_p = std::shared_ptr<reorder>(new reorder(*reorder_pd));
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 2ab2de1c1f..2bb08bcf81 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -207,7 +207,7 @@ class MKLDNNHandlerNoCachingT {
   std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
       const mkldnn::memory::desc& user_md,
       const mkldnn::memory::desc& target_md, void* ptr,
-      const std::string& suffix, bool is_persistent = false,
+      bool is_persistent = false,
      std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {}) {
     std::shared_ptr<mkldnn::memory> target_memory_p;
     if (custom_reorder_func) {
@@ -500,18 +500,9 @@ class MKLDNNHandlerT {
   }
 
   void AcquireReorder(const std::shared_ptr<mkldnn::memory>& user_memory_p,
-                      const std::shared_ptr<mkldnn::memory>& target_memory_p,
-                      const std::string& suffix) {
-    const auto key_reorder_p = key_ + suffix + "reorder_p";
-
-    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(key_reorder_p));
-
-    if (reorder_p == nullptr) {
-      reorder_p =
-          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-    }
+                      const std::shared_ptr<mkldnn::memory>& target_memory_p) {
+    auto reorder_p =
+        std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
@@ -578,6 +569,8 @@ class MKLDNNHandlerT {
         std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(user_key));
     user_memory_p->set_data_handle(ptr);
 
+    // TODO(jczaja): Here we detect if a reorder is cached, which means it
+    // is needed; this should be changed to get rid of keys
     auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
         dev_ctx_.GetBlob(key_reorder_p));
     if (reorder_p != nullptr) {
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
index ca25b849b4..dcaee49558 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
@@ -95,4 +95,6 @@ class TestConv3DOp_Valid_MKLDNN(TestConv3DOp_AsyPadding_MKLDNN):
 
 
 if __name__ == '__main__':
+    from paddle import enable_static
+    enable_static()
     unittest.main()
-- 
GitLab