From 56008aa190ba48ebc99c0fc7e7e0b6899154de74 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 19 May 2021 05:15:22 +0200 Subject: [PATCH] [oneDNN] Pool softmax and LRN access to cache optimized (#32922) --- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 133 ++++++++++++++---- .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 24 +++- .../operators/mkldnn/softmax_mkldnn_op.cc | 12 +- paddle/fluid/platform/mkldnn_reuse.h | 124 ++++++---------- .../unittests/mkldnn/test_lrn_mkldnn_op.py | 2 + 5 files changed, 175 insertions(+), 120 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index e2e9d280027..b6b0b486bf0 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -14,21 +14,104 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +template +class LRNMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* input, + const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), + unique_name)) { + if (!this->isCachedNonBlocking()) { + const int n = ctx.Attr("n"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this diffrence by + // multipliing alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + bool is_test = ctx.Attr("is_test"); + + auto dims = framework::vectorize(input->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + } + } + + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* in_x, + const Tensor* out_grad, Tensor* in_x_grad, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::PreconditionNotMet( + "is_test attribute should be set to False in training phase.")); + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto dims = framework::vectorize(in_x->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + in_x->format()); + auto diff_md = mkldnn::memory::desc( + dims, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + + this->AcquireBackwardPrimitiveDescriptorNonBlocking( + mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, + beta, k); + } + } + + std::shared_ptr AcquireWorkspaceMemory(Tensor* workspace) { + T* ptr = workspace->mutable_data( + this->place_, this->fwd_pd_->workspace_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), + ptr, "@wrk_mem_p"); + } + + std::shared_ptr AcquireBackwardWorkspaceMemory( + const Tensor* workspace) { + const T* workspace_data = workspace->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->workspace_desc(), + platform::to_void_cast(workspace_data), "@bwd-wrk_mem_p"); + } +}; + template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -48,8 +131,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - platform::LRNMKLDNNHandler handler( - ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, ctx.OutputName("Out")); + LRNMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, + ctx.OutputName("Out")); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); @@ -87,34 +170,22 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL LRNGrad must use CPUPlace")); - PADDLE_ENFORCE_EQ( - ctx.Attr("is_test"), false, - platform::errors::PreconditionNotMet( - "is_test attribute should be set to False in training phase.")); - auto x = ctx.Input("X"); + auto in_x = ctx.Input("X"); auto mid = ctx.Input("MidOut"); auto out_grad = ctx.Input(framework::GradVarName("Out")); - auto x_grad = ctx.Output(framework::GradVarName("X")); - - const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); + auto in_x_grad = ctx.Output(framework::GradVarName("X")); auto& dev_ctx = ctx.template device_context(); - auto dims = paddle::framework::vectorize(x->dims()); + LRNMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), in_x, out_grad, + in_x_grad, ctx.InputName("Out")); - platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), - out_grad->format(), dev_ctx, - ctx.GetPlace(), ctx.InputName("Out")); - - auto src_memory = handler.AcquireSrcMemory(x); + auto src_memory = handler.AcquireSrcMemory(in_x); auto workspace = handler.AcquireBackwardWorkspaceMemory(mid); auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad); - auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad); + auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad); auto lrn_bwd = handler.AcquireBackwardPrimitive(); @@ -125,8 +196,8 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_WORKSPACE, *workspace}}); astream.wait(); - x_grad->set_layout(framework::DataLayout::kMKLDNN); - x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); + in_x_grad->set_layout(framework::DataLayout::kMKLDNN); + in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index b7bed95b1d3..04e0bcbfc7c 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -43,7 +43,7 @@ class PoolingMKLDNNHandler platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), framework::ToMKLDNNDataType(input->type()), unique_name)) { - if (!this->isCached()) { + if (!this->isCachedNonBlocking()) { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, platform::errors::InvalidArgument( "Wrong layout set for Input tensor.")); @@ -100,11 +100,10 @@ class PoolingMKLDNNHandler const auto is_test = ctx.Attr("is_test"); const auto dt = framework::ToMKLDNNDataType(input->type()); - const auto fmt = input->format(); const auto exclude_padding = ctx.Attr("exclusive"); - const auto src_md = mkldnn::memory::desc(src_tz, dt, fmt); + const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format()); /* create memory descriptor for pooling without specified format * ('any') which lets a primitive (pooling in this case) choose * the memory format preferred for best performance @@ -124,7 +123,7 @@ class PoolingMKLDNNHandler ComputeAdaptivePoolParameters(ctx, src_tz, &ksize, &strides); - this->AcquireForwardPrimitiveDescriptor( + this->AcquireForwardPrimitiveDescriptorNonBlocking( is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training, pooling_type == "max" @@ -200,6 +199,10 @@ class PoolingMKLDNNHandler auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims()); + const auto dt = framework::ToMKLDNNDataType(in_x->type()); + auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format()); + auto dst_md = + mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any); auto diff_dst_md = mkldnn::memory::desc( diff_dst_tz, platform::MKLDNNGetDataType(), out_grad->format()); auto diff_src_md = @@ -216,7 +219,18 @@ class PoolingMKLDNNHandler ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides); const auto exclude_padding = ctx.Attr("exclusive"); - this->AcquireBackwardPrimitiveDescriptor( + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + mkldnn::prop_kind::forward_training, + pooling_type == "max" + ? mkldnn::algorithm::pooling_max + : (exclude_padding + ? mkldnn::algorithm::pooling_avg_exclude_padding + : mkldnn::algorithm::pooling_avg_include_padding), + src_md, dst_md, strides, ksize, mkldnn_paddings[0], + mkldnn_paddings[1]); + + this->AcquireBackwardPrimitiveDescriptorNonBlocking( pooling_type == "max" ? mkldnn::algorithm::pooling_max : (exclude_padding diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 4a55945936e..1d177e120b5 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -50,7 +50,7 @@ class SoftmaxMKLDNNHandler : platform::CreateKey( dev_ctx, framework::vectorize(input->dims()), uniq_name)) { - if (!this->isCached()) { + if (!this->isCachedNonBlocking()) { PADDLE_ENFORCE_EQ( input->dims(), output->dims(), platform::errors::InvalidArgument( @@ -60,8 +60,8 @@ class SoftmaxMKLDNNHandler auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), input->format()); - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); + this->AcquireForwardPrimitiveDescriptorNonBlocking( + prop_kind::forward_scoring, md, axis); } } @@ -90,8 +90,10 @@ class SoftmaxMKLDNNHandler auto diff_softmax_md = MKLDNNMemDesc( softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); + this->AcquireForwardPrimitiveDescriptorNonBlocking( + prop_kind::forward_scoring, data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptorNonBlocking( + diff_softmax_md, data_softmax_md, axis); } } }; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index e584b849368..5ff6f893a89 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -126,13 +126,20 @@ class MKLDNNHandlerT { return (dev_ctx_.GetBlob(key_p) != nullptr); } + bool isCachedNonBlocking() { + const std::string key_pd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + + return (fwd_pd_ != nullptr); + } + bool isBwdCached() { - const std::string key_pd = key_common_ + "@bwd_pd"; + const std::string key_pd = key_ + "@bwd_pd"; bwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); - const std::string key_p = key_ + "@bwd_p"; - return (dev_ctx_.GetBlob(key_p) != nullptr); + return (bwd_pd_ != nullptr); } // If your primitive descriptor requires attributes, pass them as a @@ -161,6 +168,20 @@ class MKLDNNHandlerT { } } + template + void AcquireForwardPrimitiveDescriptorNonBlocking(Arg&& first_arg, + Args&&... args) { + // This is used when we can recreate FWD PD in BWD so + // we do not need to pass FWD to BWD + const std::string key_pd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (fwd_pd_ == nullptr) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + dev_ctx_.SetBlob(key_pd, fwd_pd_); + } + } + // Using sfinae to specialise variadic function. Workaround for not having // if constexpr in C++ 11. template @@ -182,6 +203,8 @@ class MKLDNNHandlerT { std::make_shared(fwd_desc, engine_); } + // TODO(jczaja): After/if all ops can used xxxNonBlocking version + // then remove this one template void AcquireBackwardPrimitiveDescriptor(Args&&... args) { const std::string key_fwd_pd = key_common_ + "@fwd_pd"; @@ -201,6 +224,25 @@ class MKLDNNHandlerT { } } + template + void AcquireBackwardPrimitiveDescriptorNonBlocking(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptorNonBlocking + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_pd"; + bwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_pd_ == nullptr) { + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_pd_); + } + } + std::shared_ptr AcquireMemoryFromPrimitive( const std::string& suffix) { return std::static_pointer_cast( @@ -781,82 +823,6 @@ class ActivationMKLDNNHandler } }; -template -class LRNMKLDNNHandler - : public MKLDNNHandlerT { - public: - LRNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, - const platform::MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, - platform::Place cpu_place, const Tensor* input, - const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { - if (!this->isCached()) { - const int n = ctx.Attr("n"); - // MKL-DNN implements LRN in a caffe way: - // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html - // Where sum of squares is divided by size of normalization window - // this is not the case for PaddlePaddle LRN. - // Hence we need to compensate for this diffrence by - // multipliing alpha by size of window(n) - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - bool is_test = ctx.Attr("is_test"); - - auto dims = paddle::framework::vectorize(input->dims()); - - auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor( - is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); - } - } - - LRNMKLDNNHandler(const std::vector& dims, const int n, - const float alpha, const float beta, const float k, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, unique_name)) { - auto src_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor( - mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, beta, - k); - } - - std::shared_ptr AcquireWorkspaceMemory( - framework::Tensor* workspace) { - T* ptr = workspace->mutable_data( - this->place_, this->fwd_pd_->workspace_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - ptr, "@wrk_mem_p"); - } - - std::shared_ptr AcquireBackwardWorkspaceMemory( - const framework::Tensor* workspace) { - const T* workspace_data = workspace->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - to_void_cast(workspace_data), - "@bwd-wrk_mem_p"); - } -}; - template class TransposeMKLDNNHandler : public MKLDNNHandler { public: diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py index ba7c8abc56d..088b4fb5905 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py @@ -63,4 +63,6 @@ class TestLRNMKLDNNOpNHWC(TestLRNMKLDNNOp): if __name__ == "__main__": + from paddle import enable_static + enable_static() unittest.main() -- GitLab