From 5f0422f417691f50684bea31863db649628cf23a Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 4 Aug 2021 15:40:15 +0200 Subject: [PATCH] - Added softmax without caching --- .../operators/mkldnn/softmax_mkldnn_op.cc | 58 ++--- paddle/fluid/platform/mkldnn_reuse.h | 213 ++++++++++++++++++ 2 files changed, 233 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index e065800e4d1..91cc4662072 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -32,50 +32,34 @@ using platform::to_void_cast; template class SoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerT { public: - SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, + SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, - Tensor* output, const int axis, - const std::string uniq_name, bool is_inplaced) - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - // Softmax may be inplace then uniq_name is no longer unique - is_inplaced ? platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - axis, uniq_name) - : platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - input->dims(), output->dims(), - platform::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - - auto softmax_tz = framework::vectorize(input->dims()); - auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); - } + Tensor* output, const int axis) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + platform::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + + auto softmax_tz = framework::vectorize(input->dims()); + auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, axis); } SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* out, const Tensor* out_grad, Tensor* in_x_grad, const std::string& unique_name) : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), - unique_name)) { - if (!this->isBwdCached()) { + dev_ctx, mkldnn_engine, cpu_place) { PADDLE_ENFORCE_EQ( out_grad->dims(), in_x_grad->dims(), platform::errors::InvalidArgument("The shape of softmax_grad's input " @@ -94,7 +78,6 @@ class SoftmaxMKLDNNHandler data_softmax_md, axis); this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, axis); - } } }; @@ -111,9 +94,7 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); - SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, ctx.GetPlace(), - input, output, axis, ctx.OutputName("Out"), - is_inplaced); + SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, output, axis); auto softmax_src_memory_p = handler.AcquireSrcMemory(input); // For Inplace src and and dst are the same memory object @@ -149,11 +130,12 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); auto* out_grad = ctx.template Input(framework::GradVarName("Out")); auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, + SoftmaxMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), output, out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f63d45d7ff6..afdd41f706a 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -34,6 +34,219 @@ using framework::Tensor; using user_function = std::function(const float*)>; using memory = mkldnn::memory; + +template +class MKLDNNHandlerNoCachingT { + public: + MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) + : engine_(engine), + place_(cpu_place), + fwd_pd_(nullptr), + bwd_pd_(nullptr) { + platform::MKLDNNDeviceContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + return forward_p = std::make_shared(*fwd_pd_); + } + + std::shared_ptr AcquireBackwardPrimitive() { + return backward_p = std::make_shared(*bwd_pd_); + } + + std::shared_ptr AcquireBackwardWeightsPrimitive() { + PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( + "Error: BWD_PD should be set when " + "getting BWD prim witk key: %s .", + key_p)); + return std::make_shared(*bwd_w_pd_); + } + + std::shared_ptr AcquireSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + fwd_pd_->src_desc(), to_void_cast(input_data)); + } + + template + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); + } + + template + std::shared_ptr AcquireDstMemory( + const framework::Tensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data)); + } + + std::shared_ptr AcquireDiffDstMemory( + const framework::Tensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->diff_dst_desc(), to_void_cast(ptr)); + } + + std::shared_ptr AcquireDiffSrcMemory( + framework::Tensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); + } + + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + framework::Tensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); + } + + protected: + + // If your primitive descriptor requires attributes, pass them as a + // first argument and paramters to descriptor constructor in the following + // arguments. Otherwise, all arguments will be forwarded to descriptor + // constructor, including the first one. + template + void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + } + + // Using sfinae to specialise variadic function. Workaround for not having + // if constexpr in C++ 11. + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(args)...); + fwd_pd_ = std::make_shared( + fwd_desc, first, engine_); + } + + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(first), + std::forward(args)...); + fwd_pd_ = + std::make_shared(fwd_desc, engine_); + } + + template + void AcquireBackwardPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md, void* ptr) { + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md) { + return std::make_shared(md, engine_); + } + + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const mkldnn::memory::desc& user_md, + const mkldnn::memory::desc& target_md, void* ptr, + const std::string& suffix, bool is_persistent = false, + std::function(const F*)> custom_reorder_func = {}) { + + std::shared_ptr target_memory_p; + if (custom_reorder_func) { + auto reordered_data = + custom_reorder_func(reinterpret_cast(ptr)); + ptr = reinterpret_cast(reordered_data.get()); + } + auto user_memory_p = + std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + return target_memory_p; + } + + + mkldnn::engine engine_; + platform::Place place_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; +}; + template -- GitLab