From 173660be7bcef1184ec6c594000a10384623001c Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Mon, 25 Jan 2021 05:36:30 +0100 Subject: [PATCH] [oneDNN] Cache oneDNN stream not to recreate in each oneDNN op (#30358) --- .../fluid/framework/data_layout_transform.cc | 2 +- .../mkldnn/elementwise_add_mkldnn_op.cc | 2 +- .../mkldnn/elementwise_mkldnn_op.h | 2 +- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 8 ++--- .../fused/mkldnn/multi_gru_mkldnn_op.cc | 10 +++---- .../operators/mkldnn/activation_mkldnn_op.cc | 4 +-- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 4 +-- .../operators/mkldnn/concat_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 6 ++-- .../mkldnn/conv_transpose_mkldnn_op.cc | 2 +- .../operators/mkldnn/dequantize_mkldnn_op.cc | 2 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 6 ++-- .../operators/mkldnn/interpolate_mkldnn_op.cc | 2 +- .../operators/mkldnn/layer_norm_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 4 +-- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 9 +++--- .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 4 +-- .../operators/mkldnn/quantize_mkldnn_op.cc | 2 +- .../operators/mkldnn/requantize_mkldnn_op.cc | 2 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 4 +-- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 2 +- .../operators/mkldnn/transpose_mkldnn_op.cc | 4 +-- paddle/fluid/platform/device_context.cc | 30 ++++++++++++++++--- paddle/fluid/platform/device_context.h | 9 ++++-- paddle/fluid/platform/mkldnn_helper.h | 2 +- paddle/fluid/platform/mkldnn_reuse.h | 10 +++---- 26 files changed, 81 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index a42d2913187..e6faeb5e0ff 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -193,7 +193,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - mkldnn::stream astream(cpu_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("ext_reorder", platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index db634813230..0ecb6266e4a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -48,7 +48,7 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type, dev_ctx, onednn_engine, key); - mkldnn::stream astream(onednn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); auto reorder_src_memory_p = handler.AcquireSrcMemory( dout->format(), platform::to_void_cast(dout->data())); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index e679f62a25a..8a646e5865d 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -68,7 +68,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { const auto binary_prim = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); const std::unordered_map args = { {DNNL_ARG_SRC_0, *src_x_memory}, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 1eed49de784..da811faa41b 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -246,7 +246,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { memory_p = std::make_shared(this->fwd_pd_->src_iter_desc(), this->engine_); - dnnl::stream astream(this->engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_h0_memory, *memory_p, attr_) .execute(astream, user_h0_memory, *memory_p); @@ -284,7 +284,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { memory_p = std::make_shared( this->fwd_pd_->weights_layer_desc(), this->engine_); - dnnl::stream astream(this->engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); @@ -337,7 +337,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { memory_p = std::make_shared( this->fwd_pd_->weights_iter_desc(), this->engine_); - dnnl::stream astream(this->engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); @@ -469,7 +469,7 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { auto gru_forward_p = handler.AcquireForwardPrimitive(); - dnnl::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); gru_forward_p->execute(astream, gru_args); astream.wait(); diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 11711bab817..cc3de3ee53c 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -292,7 +292,7 @@ class MultiGRUHandler { auto gru_forward_p0 = AcquireGruPrimitive(layer, dir); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); gru_forward_p0->execute(astream, gru_args); astream.wait(); return out_mem; @@ -315,7 +315,7 @@ class MultiGRUHandler { memory_p = std::make_shared( gru_pds_[{layer, dir}]->src_iter_desc(), engine_); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_h0_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) .execute(astream, user_h0_memory, *memory_p); @@ -354,7 +354,7 @@ class MultiGRUHandler { memory_p = std::make_shared( gru_pds_[{layer, dir}]->weights_layer_desc(), engine_); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) .execute(astream, user_memory, *memory_p); @@ -410,7 +410,7 @@ class MultiGRUHandler { memory_p = std::make_shared( gru_pds_[{layer, dir}]->weights_iter_desc(), engine_); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) .execute(astream, user_memory, *memory_p); @@ -516,7 +516,7 @@ class MultiGRUHandler { auto concat_p = AcquireConcatPrimitive(layer); - dnnl::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); concat_p->execute(astream, concat_args); astream.wait(); return out_mem; diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 5c49e87730e..49645c33092 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -112,7 +112,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y); auto activation_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}}); astream.wait(); @@ -158,7 +158,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx, auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x); auto activation_backward_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); activation_backward_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index e53e052a89c..75367ba0573 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -220,7 +220,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { y->set_layout(DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); batch_norm_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, {MKLDNN_ARG_SCALE_SHIFT, *scaleshift_memory}, @@ -321,7 +321,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // finally create batch_norm backward primitive auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); batch_norm_bwd_p->execute( astream, {{MKLDNN_ARG_SRC, *src_memory}, {MKLDNN_ARG_MEAN, *mean_memory}, diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 63aa2357bee..4beb7ad0178 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -202,7 +202,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { output->mutable_data(place, concat_pd->dst_desc().get_size())); } - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); std::unordered_map args; for (size_t i = 0; i < multi_input.size(); ++i) { args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, (*srcs).at(i)}); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 68fe5828388..67b857aac02 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -471,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); conv_p->execute(astream, args); astream.wait(); @@ -553,7 +553,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { conv_p = std::static_pointer_cast( dev_ctx.GetBlob(prim_key)); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (conv_p == nullptr || !is_test) { float fuse_alpha = ctx.Attr("fuse_alpha"); @@ -1045,7 +1045,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { user_weights_md, to_void_cast(filter_data)); auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory( user_diff_dst_md, to_void_cast(output_grad_data)); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (filter_grad) { auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive( user_src_memory_p, pipeline); diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 1eb90451a69..f5e62cb44ee 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -242,7 +242,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto conv_p = handler.AcquireConvolution(); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (bias) { const T* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 8d41b750972..0c8ea84296e 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -124,7 +124,7 @@ class DeQuantOpKernel : public framework::OpKernel { dst_memory->set_data_handle(output->mutable_data(ctx.GetPlace())); } - mkldnn::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); reorder_p->execute(astream, *src_memory, *dst_memory); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 89a24cab5f6..dae9ccd3169 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -137,7 +137,7 @@ class FCPrimitiveFactory { } void Execute() { - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (bias_) { fc_->execute(astream, {{MKLDNN_ARG_SRC, *input_}, {MKLDNN_ARG_WEIGHTS, *weights_}, @@ -280,7 +280,7 @@ class FCPrimitiveFactory { auto dst_mem = std::make_shared(dst_desc, engine_); auto reorder = mkldnn::reorder(src_mem, *dst_mem); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", @@ -309,7 +309,7 @@ class FCPrimitiveFactory { attributes.set_output_scales(mask, scale_data); auto reorder = mkldnn::reorder(*src_mem, *dst_mem, attributes); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index f7df19ead99..64a1903c2da 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -154,7 +154,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { auto resampling_prim = handler.AcquireForwardPrimitive(); const std::unordered_map args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); resampling_prim->execute(astream, args); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 65dcb328f20..cc4bfbae266 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -120,7 +120,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { auto layer_norm_p = handler.AcquireForwardPrimitive(); - dnnl::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); std::unordered_map args; args.insert({DNNL_ARG_SRC, *src_memory}); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 9ee653ec589..e2e9d280027 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -59,7 +59,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto workspace_memory = handler.AcquireWorkspaceMemory(mid); mid->set_layout(framework::DataLayout::kMKLDNN); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (!workspace_memory->get_desc().is_zero()) { mid->set_format(platform::GetMKLDNNFormat(*workspace_memory)); lrn_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, @@ -118,7 +118,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto lrn_bwd = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); lrn_bwd->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}, {MKLDNN_ARG_DIFF_SRC, *diff_src_memory}, diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 46d51606d42..b3d970c7f05 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -109,7 +109,7 @@ class MulPrimitiveFactory { auto reorder = mkldnn::reorder(reorder_pd); - mkldnn::stream astream(engine_); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); @@ -184,7 +184,7 @@ class MulPrimitiveFactory { } void Execute() { - mkldnn::stream astream(engine_); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); (*mul_).execute(astream, {{MKLDNN_ARG_SRC, *x_input_}, {MKLDNN_ARG_WEIGHTS, *y_input_}, {MKLDNN_ARG_DST, *output_}}); @@ -270,8 +270,7 @@ class MulPrimitiveFactory { auto reorder = mkldnn::reorder(src_mem, dst_mem); - mkldnn::stream astream(engine_); - + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); @@ -355,7 +354,7 @@ class MulMKLDNNKernel : public framework::OpKernel { "Operator DNNL Mul must use CPUPlace")); platform::MKLDNNDeviceContext::tls().log_lib_version(); auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); + auto &mkldnn_engine = dev_ctx.GetEngine(); const Tensor *x = ctx.Input("X"); const Tensor *y = ctx.Input("Y"); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 9488a1a4405..04a4bc91fe4 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -51,7 +51,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if ((ctx.Attr("is_test") == false) && (ctx.Attr("pooling_type") == "max")) { // Training @@ -154,7 +154,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto pool_bwd_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (pooling_type == "max") { // Max - pooling needs Workspace auto workspace_memory = handler.AcquireWorkspaceMemory(); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 7a03c6ce86d..819c0d15505 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -140,7 +140,7 @@ class QuantOpKernel : public framework::OpKernel { } } - mkldnn::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index aa74a45e3a5..33422455ada 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -137,7 +137,7 @@ class ReQuantOpKernel : public framework::OpKernel { } } - dnnl::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index abe0a556536..1138d511392 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -117,7 +117,7 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto softmax_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); softmax_p->execute(astream, {{DNNL_ARG_SRC, *softmax_src_memory_p}, {DNNL_ARG_DST, *softmax_dst_memory_p}}); astream.wait(); @@ -169,7 +169,7 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); softmax_bwd_p->execute(astream, {{MKLDNN_ARG_DST, *dst_memory_p}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 2b6f9594724..7618b1d9c31 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -178,7 +178,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } args.insert({MKLDNN_ARG_DST, *dst_mem}); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); sum_p->execute(astream, args); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index feda5645b4c..4c46a927009 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -61,7 +61,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, transpose_src_memory_p); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); transpose_p->execute(astream, *transpose_src_memory_p, *transpose_dst_memory_p); astream.wait(); @@ -116,7 +116,7 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, transpose_src_memory_p); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); transpose_p->execute(astream, *transpose_src_memory_p, *transpose_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 4d952ecda0c..23690cb8791 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -458,20 +458,34 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_MKLDNN MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) - : CPUDeviceContext(place), - engine_(mkldnn::engine::kind::cpu, 0), - p_blobmap_() { + : CPUDeviceContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); p_mutex_.reset(new std::mutex()); } -MKLDNNDeviceContextThreadLocals::Body::Body() { +MKLDNNDeviceContextThreadLocals::Body::Body() + : cur_engine(mkldnn::engine::kind::cpu, 0), cur_stream(cur_engine) { cur_mkldnn_session_id = kMKLDNNSessionID_Default; cur_input_shape_str = ""; cur_input_shape_cache_capacity = 1; cur_paddle_data_layout = paddle::framework::DataLayout::kNCHW; } +// When Thread finish we clear oneDNN cache +// This is needed when we have one executor used by many threads +// e.g. test_analyzer_detect. Thread ID is not part of caching key +// (for naive executor) so we need to clear cache when one thread finish +// and other is to start inference +// TODO(jczaja): Ideally it would be good to clear only part of cache +// related to thread that is to be terminated +MKLDNNDeviceContextThreadLocals::Body::~Body() { + auto cpu_place = paddle::platform::CPUPlace(); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::MKLDNNDeviceContext* dev_ctx = + (platform::MKLDNNDeviceContext*)pool.Get(cpu_place); + dev_ctx->ResetBlobMap(); +} + void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id( size_t sid) { cur_mkldnn_session_id = sid; @@ -508,6 +522,14 @@ void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) { } } +const mkldnn::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) { + return cur_engine; +} + +mkldnn::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { + return cur_stream; +} + void MKLDNNDeviceContext::ResetBlobMap() { std::lock_guard lock(*p_mutex_); if (!block_next_cache_clearing_) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index f058da97b5c..e37a5e18e01 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -525,8 +525,12 @@ class MKLDNNDeviceContextThreadLocals { // Recently registered data_format. This is needed to // know for converting MKL-DNN Tensor to non MKL-DNN paddle::framework::DataLayout cur_paddle_data_layout; + // MKL-DNN stream used for execution of primitives (per-thread) + mkldnn::engine cur_engine; + mkldnn::stream cur_stream; Body(); + ~Body(); void set_cur_mkldnn_session_id(size_t sid); size_t get_cur_mkldnn_session_id(void); void set_cur_input_shape_str(std::string input_shape_str); @@ -534,6 +538,8 @@ class MKLDNNDeviceContextThreadLocals { void set_cur_paddle_data_layout(framework::DataLayout dl); framework::DataLayout get_cur_paddle_data_layout(void); void log_lib_version(void); + const mkldnn::engine& get_engine(void); + mkldnn::stream& get_stream(void); }; MKLDNNDeviceContextThreadLocals() = default; MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = @@ -572,7 +578,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { explicit MKLDNNDeviceContext(CPUPlace place); /* \brief Get the active engine */ - const mkldnn::engine& GetEngine() const { return engine_; } + const mkldnn::engine& GetEngine() const { return tls().get_engine(); } // Remove all entries from the blob map void ResetBlobMap(); @@ -605,7 +611,6 @@ class MKLDNNDeviceContext : public CPUDeviceContext { } private: - mkldnn::engine engine_; std::shared_ptr p_blobmap_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 37747cd3fd3..79c536508da 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -188,7 +188,7 @@ MKLDNNGetDataType() { inline void Reorder(mkldnn::memory src, mkldnn::memory dst, const mkldnn::engine& engine) { auto reorder_prim = mkldnn::reorder(src, dst); - mkldnn::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); reorder_prim.execute(astream, src, dst); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 55a230cabef..37aae14c83a 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -232,7 +232,7 @@ class MKLDNNHandlerT { dev_ctx_.SetBlob(key_reorder_p, reorder_p); } - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); @@ -261,7 +261,7 @@ class MKLDNNHandlerT { std::make_shared(*user_memory_p, *target_memory_p); dev_ctx_.SetBlob(key_reorder_p, reorder_p); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, @@ -273,7 +273,7 @@ class MKLDNNHandlerT { dev_ctx_.SetBlob(user_key, user_memory_p); dev_ctx_.SetBlob(target_key, target_memory_p); } else if (!is_persistent) { - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); auto user_memory_p = std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); @@ -425,7 +425,7 @@ class MKLDNNHandler { auto reorder_p = std::make_shared(*user_memory_p, *target_memory_p); dev_ctx_.SetBlob(key_reorder_p, reorder_p); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, @@ -451,7 +451,7 @@ class MKLDNNHandler { auto target_memory_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (target_memory_p == nullptr) { target_memory_p = user_memory_p; -- GitLab