From b490e41c1d60cd042d851dddcae05c8a1d0206b1 Mon Sep 17 00:00:00 2001 From: Adam <38704900+grygielski@users.noreply.github.com> Date: Mon, 1 Jun 2020 16:05:37 +0200 Subject: [PATCH] Add isCached() mechanism for BatchNorm and LRN oneDNN operators (#24798) * Add isCached() mechanism for BatchNorm and LRN oneDNN operators test=develop * Formatting fix test=develop --- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 139 ++++++++++-------- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 24 +-- paddle/fluid/platform/mkldnn_reuse.h | 62 ++++++-- 3 files changed, 130 insertions(+), 95 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index fde4900c6d3..33cf00b2c01 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -31,22 +31,45 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerT { public: - BatchNormMKLDNNHandler(const std::vector &dims, const float &epsilon, - const mkldnn::normalization_flags &flags, - const bool &global_stats, const MKLDNNMemoryFormat fmt, + BatchNormMKLDNNHandler(const paddle::framework::ExecutionContext &ctx, const platform::MKLDNNDeviceContext &dev_ctx, - platform::Place cpu_place, - const std::string &uniq_name) + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor *x, + const bool global_stats, const bool test_mode, + const std::string &unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dims, uniq_name)) { - auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - - this->AcquireForwardPrimitiveDescriptor( - global_stats == true ? 
mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training, - md, epsilon, flags); + platform::CreateKey(framework::vectorize(x->dims()), unique_name)) { + if (!this->isCached()) { + const float epsilon = ctx.Attr("epsilon"); + const bool fuse_with_relu = ctx.Attr("fuse_with_relu"); + + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor")); + + auto src_tz = paddle::framework::vectorize(x->dims()); + + // Flags are added by bitwise OR operation + auto flags = mkldnn::normalization_flags::use_scale_shift; // 001 + if (global_stats) + flags |= mkldnn::normalization_flags::use_global_stats; // 010 + if (fuse_with_relu && test_mode) + flags |= mkldnn::normalization_flags::fuse_norm_relu; // 100 + + auto md = mkldnn::memory::desc( + src_tz, platform::MKLDNNGetDataType(), + platform::MKLDNNFormatForSize(src_tz.size(), x->format())); + + this->AcquireForwardPrimitiveDescriptor( + global_stats == true ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training, + md, epsilon, flags); + } } BatchNormMKLDNNHandler(const std::vector &dims, const float &epsilon, const mkldnn::normalization_flags &flags, @@ -68,9 +91,30 @@ class BatchNormMKLDNNHandler mkldnn::prop_kind::backward, diff_dst_md, src_md, epsilon, flags); } - std::shared_ptr AcquireScaleShiftMemory(T *scaleshift_data) { - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_desc(), scaleshift_data, "@scaleshift_mem_p"); + std::shared_ptr AcquireScaleShiftMemory(const Tensor *scale, + const Tensor *shift, + const bool is_test) { + auto scaleshift_memory = this->AcquireMemory("@scaleshift_mem_p"); + if (scaleshift_memory == nullptr || !is_test) { + auto scale_tz = paddle::framework::vectorize(scale->dims()); + const unsigned int C = scale_tz[0]; + PADDLE_ENFORCE_EQ( + scale_tz.size(), 1, + platform::errors::InvalidArgument( + "Dims of scale tensor must be 1, but received scale's size is %d", + scale_tz.size())); + + auto mem_p = this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), "@scaleshift_mem_p"); + + // MKLDNN requires a single piece of memory for scale and shift/bias data + auto mem_data_handle = reinterpret_cast(mem_p->get_data_handle()); + std::copy(scale->data(), scale->data() + C, mem_data_handle); + std::copy(shift->data(), shift->data() + C, mem_data_handle + C); + + return mem_p; + } + return scaleshift_memory; } std::shared_ptr AcquireDiffScaleShiftMemory( @@ -115,64 +159,30 @@ template class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - const float momentum = ctx.Attr("momentum"); + auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); + const bool is_test = ctx.Attr("is_test"); const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool fuse_with_relu = 
ctx.Attr("fuse_with_relu"); const bool trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - - bool global_stats = test_mode || use_global_stats; - - auto &dev_ctx = ctx.template device_context(); + const bool test_mode = is_test && (!trainable_stats); + const bool global_stats = test_mode || use_global_stats; const auto *x = ctx.Input("X"); const auto *scale = ctx.Input("Scale"); const auto *shift = ctx.Input("Bias"); auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); auto *batch_mean = ctx.Output("SavedMean"); auto *batch_variance = ctx.Output("SavedVariance"); - PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, - "Wrong layout set for X tensor"); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for X tensor"); - - auto src_tz = paddle::framework::vectorize(x->dims()); - auto scale_tz = paddle::framework::vectorize(scale->dims()); - PADDLE_ENFORCE_EQ( - scale_tz.size(), 1, - platform::errors::InvalidArgument( - "Dims of scale tensor must be 1, but received scale's size is %d", - scale_tz.size())); - const unsigned int C = scale_tz[0]; - - // MKLDNN requires a single piece of memory for scale and shift/bias data - - std::vector scaleshift_data(scale->data(), scale->data() + C); - scaleshift_data.reserve(2 * C); - scaleshift_data.insert(scaleshift_data.end(), shift->data(), - shift->data() + C); - - // Flags are added by bitwise OR operation - auto flags = mkldnn::normalization_flags::use_scale_shift; // 001 - if (global_stats) - flags |= mkldnn::normalization_flags::use_global_stats; // 010 - if (fuse_with_relu && test_mode) - flags |= mkldnn::normalization_flags::fuse_norm_relu; // 100 - - BatchNormMKLDNNHandler handler( - src_tz, epsilon, flags, global_stats, - platform::MKLDNNFormatForSize(src_tz.size(), x->format()), dev_ctx, - ctx.GetPlace(), ctx.OutputName("SavedMean")); + BatchNormMKLDNNHandler handler(ctx, 
dev_ctx, mkldnn_engine, + ctx.GetPlace(), x, global_stats, + test_mode, ctx.OutputName("SavedMean")); auto src_memory = handler.AcquireSrcMemory(x); auto scaleshift_memory = - handler.AcquireScaleShiftMemory(scaleshift_data.data()); + handler.AcquireScaleShiftMemory(scale, shift, is_test); auto dst_memory = handler.AcquireDstMemory(y); auto batch_norm_p = handler.AcquireForwardPrimitive(); @@ -206,6 +216,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { astream.wait(); if (!global_stats) { + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + const float momentum = ctx.Attr("momentum"); + + const unsigned int C = paddle::framework::vectorize(scale->dims())[0]; + // mkldnn only compute stats for current batch // so we need compute momentum stats via Eigen lib EigenVectorArrayMap batch_mean_e( @@ -273,11 +289,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * C; - std::vector scaleshift_data(scale->data(), scale->data() + C); - scaleshift_data.reserve(scaleshift_size); - scaleshift_data.insert(scaleshift_data.end(), shift->data(), - shift->data() + C); - std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); @@ -286,7 +297,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto variance_memory = handler.AcquireVarianceMemory(batch_variance); auto diff_dst_memory = handler.AcquireDiffDstMemory(diff_y); auto scaleshift_memory = - handler.AcquireScaleShiftMemory(scaleshift_data.data()); + handler.AcquireScaleShiftMemory(scale, shift, false); auto diff_src_memory = handler.AcquireDiffSrcMemory(diff_x); auto diff_scaleshift_memory = handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data()); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 
817711f3157..4bfaeb41ee8 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -33,29 +33,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL LRN must use CPUPlace")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); auto x = ctx.Input("X"); auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - const int n = ctx.Attr("n"); - // MKL-DNN implements LRN in a caffe way: - // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html - // Where sum of squares is divided by size of normalization window - // this is not the case for PaddlePaddle LRN. - // Hence we need to compensate for this diffrence by - // multipliing alpha by size of window(n) - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - bool is_test = ctx.Attr("is_test"); - - auto dims = paddle::framework::vectorize(x->dims()); - - platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), - is_test, dev_ctx, ctx.GetPlace(), - ctx.OutputName("Out")); + platform::LRNMKLDNNHandler handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, ctx.OutputName("Out")); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); @@ -77,6 +64,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Disable checking mid in unit tests (Require API change) mid->mutable_data(ctx.GetPlace()); auto e_mid = framework::EigenTensor::From(*mid); + const float k = ctx.Attr("k"); e_mid = e_mid.constant(k); mid->set_format(platform::GetMKLDNNFormat(*dst_memory)); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 
2d475e7150a..ff42bb41449 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -162,7 +162,7 @@ class MKLDNNHandlerT { std::shared_ptr AcquireMemoryFromPrimitive( mkldnn::memory::desc md, void* ptr, const std::string& suffix) { - auto local_key = key_ + suffix; + const auto local_key = key_ + suffix; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { @@ -174,6 +174,24 @@ class MKLDNNHandlerT { return mem_p; } + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md, const std::string& suffix) { + const auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (mem_p == nullptr) { + mem_p = std::make_shared(md, engine_); + dev_ctx_.SetBlob(local_key, mem_p); + } + return mem_p; + } + + std::shared_ptr AcquireMemory(const std::string& suffix) { + const auto local_key = key_ + suffix; + return std::static_pointer_cast( + dev_ctx_.GetBlob(local_key)); + } + const MKLDNNDeviceContext& dev_ctx_; mkldnn::engine engine_; platform::Place place_; @@ -535,21 +553,39 @@ template class LRNMKLDNNHandler : public MKLDNNHandlerT { public: - LRNMKLDNNHandler(const std::vector& dims, const int n, - const float alpha, const float beta, const float k, - const MKLDNNMemoryFormat fmt, bool is_test, + LRNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& unique_name) + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* input, + const std::string& unique_name) : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dims, unique_name)) { - auto src_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - this->AcquireForwardPrimitiveDescriptor( - is_test ? 
mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + dev_ctx, mkldnn_engine, cpu_place, + platform::CreateKey(framework::vectorize(input->dims()), + unique_name)) { + if (!this->isCached()) { + const int n = ctx.Attr("n"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this difference by + // multiplying alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + bool is_test = ctx.Attr("is_test"); + + auto dims = paddle::framework::vectorize(input->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor( + is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + } } LRNMKLDNNHandler(const std::vector& dims, const int n, -- GitLab