diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index a42d2913187df5dd2d2bee29bf3372674516717e..e6faeb5e0ff43eb7602c8932678e367eb7a18b56 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -193,7 +193,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - mkldnn::stream astream(cpu_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("ext_reorder", platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index db63481323073401ee13c1b2d0f463a6fdbad169..0ecb6266e4a16cc5d2c22a47c0513da4410a6215 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -48,7 +48,7 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type, dev_ctx, onednn_engine, key); - mkldnn::stream astream(onednn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); auto reorder_src_memory_p = handler.AcquireSrcMemory( dout->format(), platform::to_void_cast(dout->data())); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index e679f62a25ac2c2bc134e0e55bbed723b7fcd0bd..8a646e5865d922652f7f01509e7e8bbf06da48ea 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -68,7 +68,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { const auto binary_prim = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); const std::unordered_map args = { {DNNL_ARG_SRC_0, *src_x_memory}, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 1eed49de784089a078b462e7ab0e47a456df3c1b..da811faa41bc765cb65442e2372c30b40458bcfe 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -246,7 +246,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { memory_p = std::make_shared(this->fwd_pd_->src_iter_desc(), this->engine_); - dnnl::stream astream(this->engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_h0_memory, *memory_p, attr_) .execute(astream, user_h0_memory, *memory_p); @@ -284,7 +284,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { memory_p = std::make_shared( this->fwd_pd_->weights_layer_desc(), this->engine_); - dnnl::stream astream(this->engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); @@ -337,7 +337,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { memory_p = std::make_shared( this->fwd_pd_->weights_iter_desc(), this->engine_); - dnnl::stream astream(this->engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); @@ -469,7 +469,7 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { auto gru_forward_p = handler.AcquireForwardPrimitive(); - dnnl::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); gru_forward_p->execute(astream, gru_args); astream.wait(); diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 11711bab81735efd0494d454c33cf5aa0f0274a3..cc3de3ee53c1528a2a7061b07eecca835975f830 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -292,7 +292,7 @@ class MultiGRUHandler { auto gru_forward_p0 = AcquireGruPrimitive(layer, dir); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); gru_forward_p0->execute(astream, gru_args); astream.wait(); return out_mem; @@ -315,7 +315,7 @@ class MultiGRUHandler { memory_p = std::make_shared( gru_pds_[{layer, dir}]->src_iter_desc(), engine_); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_h0_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) .execute(astream, user_h0_memory, *memory_p); @@ -354,7 +354,7 @@ class MultiGRUHandler { memory_p = std::make_shared( gru_pds_[{layer, dir}]->weights_layer_desc(), engine_); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) .execute(astream, user_memory, *memory_p); @@ -410,7 +410,7 @@ class MultiGRUHandler { memory_p = std::make_shared( gru_pds_[{layer, dir}]->weights_iter_desc(), engine_); - dnnl::stream astream(engine_); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) .execute(astream, user_memory, *memory_p); @@ -516,7 +516,7 @@ class MultiGRUHandler { auto concat_p = AcquireConcatPrimitive(layer); - dnnl::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); concat_p->execute(astream, concat_args); astream.wait(); return out_mem; diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 5c49e87730e1451ef6c3a65a9f38add87dc1f030..49645c330922a1f088471346ffd5f9cf67507ef1 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -112,7 +112,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y); auto activation_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}}); astream.wait(); @@ -158,7 +158,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx, auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x); auto activation_backward_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); activation_backward_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index e53e052a89c6221e21b536fa8567ae013f5007be..75367ba0573209338b3ba85ab2ac7240f07d58d3 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -220,7 +220,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { y->set_layout(DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); batch_norm_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, {MKLDNN_ARG_SCALE_SHIFT, *scaleshift_memory}, @@ -321,7 +321,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // finally create batch_norm backward primitive auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); batch_norm_bwd_p->execute( astream, {{MKLDNN_ARG_SRC, *src_memory}, {MKLDNN_ARG_MEAN, *mean_memory}, diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 63aa2357beea074c9ab6a11230d50a9ea114863a..4beb7ad017851b513331b0e40dc5a9ee35939716 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -202,7 +202,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { output->mutable_data(place, concat_pd->dst_desc().get_size())); } - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); std::unordered_map args; for (size_t i = 0; i < multi_input.size(); ++i) { args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, (*srcs).at(i)}); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 68fe5828388ee2d89e669d771da9acb06fbcd4d0..67b857aac02380740547cc69c772fb4e7478c07d 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -471,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); conv_p->execute(astream, args); astream.wait(); @@ -553,7 +553,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { conv_p = std::static_pointer_cast( dev_ctx.GetBlob(prim_key)); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (conv_p == nullptr || !is_test) { float fuse_alpha = ctx.Attr("fuse_alpha"); @@ -1045,7 +1045,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { user_weights_md, to_void_cast(filter_data)); auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory( user_diff_dst_md, to_void_cast(output_grad_data)); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (filter_grad) { auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive( user_src_memory_p, pipeline); diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 1eb90451a6952944afc3faee335e2a010fb3c2de..f5e62cb44eec4bc49d80035e5a0c6520468e61b8 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -242,7 +242,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto conv_p = handler.AcquireConvolution(); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (bias) { const T* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 8d41b750972352df7c957c6295cab972f3031a2a..0c8ea84296ec085abde9413329cc643df99f7aa8 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -124,7 +124,7 @@ class DeQuantOpKernel : public framework::OpKernel { dst_memory->set_data_handle(output->mutable_data(ctx.GetPlace())); } - mkldnn::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); reorder_p->execute(astream, *src_memory, *dst_memory); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 89a24cab5f6745e66b77ba02dd24daeae90aa122..dae9ccd31691ac38409db655c961d1cd3c3eec1d 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -137,7 +137,7 @@ class FCPrimitiveFactory { } void Execute() { - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (bias_) { fc_->execute(astream, {{MKLDNN_ARG_SRC, *input_}, {MKLDNN_ARG_WEIGHTS, *weights_}, @@ -280,7 +280,7 @@ class FCPrimitiveFactory { auto dst_mem = std::make_shared(dst_desc, engine_); auto reorder = mkldnn::reorder(src_mem, *dst_mem); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", @@ -309,7 +309,7 @@ class FCPrimitiveFactory { attributes.set_output_scales(mask, scale_data); auto reorder = mkldnn::reorder(*src_mem, *dst_mem, attributes); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index f7df19ead9921a8a0fe7c1617777a41811f793a2..64a1903c2da4ff5bc5e903ab33124d49bf1b8cdd 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -154,7 +154,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { auto resampling_prim = handler.AcquireForwardPrimitive(); const std::unordered_map args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); resampling_prim->execute(astream, args); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 65dcb328f20839d4dc9f37e1b7175a4a0245e99e..cc4bfbae2665fe7030dccd48b28c8819164e68c7 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -120,7 +120,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { auto layer_norm_p = handler.AcquireForwardPrimitive(); - dnnl::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); std::unordered_map args; args.insert({DNNL_ARG_SRC, *src_memory}); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 9ee653ec58912b326dc44f3f2289bd9dac6b3c62..e2e9d280027b6a30958b308429cbb21d61fb2c08 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -59,7 +59,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto workspace_memory = handler.AcquireWorkspaceMemory(mid); mid->set_layout(framework::DataLayout::kMKLDNN); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (!workspace_memory->get_desc().is_zero()) { mid->set_format(platform::GetMKLDNNFormat(*workspace_memory)); lrn_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, @@ -118,7 +118,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto lrn_bwd = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); lrn_bwd->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}, {MKLDNN_ARG_DIFF_SRC, *diff_src_memory}, diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 46d51606d42da8aa6b946bebf506a0e31a485e57..b3d970c7f0513f903f637044de80775db67d7121 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -109,7 +109,7 @@ class MulPrimitiveFactory { auto reorder = mkldnn::reorder(reorder_pd); - mkldnn::stream astream(engine_); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); @@ -184,7 +184,7 @@ class MulPrimitiveFactory { } void Execute() { - mkldnn::stream astream(engine_); + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); (*mul_).execute(astream, {{MKLDNN_ARG_SRC, *x_input_}, {MKLDNN_ARG_WEIGHTS, *y_input_}, {MKLDNN_ARG_DST, *output_}}); @@ -270,8 +270,7 @@ class MulPrimitiveFactory { auto reorder = mkldnn::reorder(src_mem, dst_mem); - mkldnn::stream astream(engine_); - + auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); @@ -355,7 +354,7 @@ class MulMKLDNNKernel : public framework::OpKernel { "Operator DNNL Mul must use CPUPlace")); platform::MKLDNNDeviceContext::tls().log_lib_version(); auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); + auto &mkldnn_engine = dev_ctx.GetEngine(); const Tensor *x = ctx.Input("X"); const Tensor *y = ctx.Input("Y"); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 9488a1a4405a46e6666d537dd9e441125fbfbaa9..04a4bc91fe43a900a3b2c194787297f215482a9f 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -51,7 +51,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if ((ctx.Attr("is_test") == false) && (ctx.Attr("pooling_type") == "max")) { // Training @@ -154,7 +154,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto pool_bwd_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (pooling_type == "max") { // Max - pooling needs Workspace auto workspace_memory = handler.AcquireWorkspaceMemory(); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 7a03c6ce86d4bde22c15b8e4019b2b9bcec752e3..819c0d15505ca975301e2e37cfaaad4d3c165d2b 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -140,7 +140,7 @@ class QuantOpKernel : public framework::OpKernel { } } - mkldnn::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index aa74a45e3a575f4078416fd695ccdec055053e09..33422455ada29678fa1ac55ce0d8ec47e3ba720a 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -137,7 +137,7 @@ class ReQuantOpKernel : public framework::OpKernel { } } - dnnl::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index abe0a556536638a702d380f4349f31ff78cb3492..1138d5113929329462a7ea6ccd01f1b7bc375322 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -117,7 +117,7 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto softmax_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); softmax_p->execute(astream, {{DNNL_ARG_SRC, *softmax_src_memory_p}, {DNNL_ARG_DST, *softmax_dst_memory_p}}); astream.wait(); @@ -169,7 +169,7 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); softmax_bwd_p->execute(astream, {{MKLDNN_ARG_DST, *dst_memory_p}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 2b6f959472491e346cf98a2bb238d7d21343a3a5..7618b1d9c31218bf6e15b048801a3bb196a94fce 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -178,7 +178,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } args.insert({MKLDNN_ARG_DST, *dst_mem}); - mkldnn::stream astream(dev_ctx.GetEngine()); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); sum_p->execute(astream, args); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index feda5645b4cfa2bf580cc5bcefbc41d124dd3cc5..4c46a92700996a14ebe06486f207309816eaf98a 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -61,7 +61,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, transpose_src_memory_p); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); transpose_p->execute(astream, *transpose_src_memory_p, *transpose_dst_memory_p); astream.wait(); @@ -116,7 +116,7 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, transpose_src_memory_p); - mkldnn::stream astream(mkldnn_engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); transpose_p->execute(astream, *transpose_src_memory_p, *transpose_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 4d952ecda0caf7ba94a80459f6e0c051fd0347fb..23690cb879123198376314f0bf264be2b97393b5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -458,20 +458,34 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_MKLDNN MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) - : CPUDeviceContext(place), - engine_(mkldnn::engine::kind::cpu, 0), - p_blobmap_() { + : CPUDeviceContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); p_mutex_.reset(new std::mutex()); } -MKLDNNDeviceContextThreadLocals::Body::Body() { +MKLDNNDeviceContextThreadLocals::Body::Body() + : cur_engine(mkldnn::engine::kind::cpu, 0), cur_stream(cur_engine) { cur_mkldnn_session_id = kMKLDNNSessionID_Default; cur_input_shape_str = ""; cur_input_shape_cache_capacity = 1; cur_paddle_data_layout = paddle::framework::DataLayout::kNCHW; } +// When Thread finish we clear oneDNN cache +// This is needed when we have one executor used by many threads +// e.g. test_analyzer_detect. Thread ID is not part of caching key +// (for naive executor) so we need to clear cache when one thread finish +// and other is to start inference +// TODO(jczaja): Ideally it would be good to clear only part of cache +// related to thread that is to be terminated +MKLDNNDeviceContextThreadLocals::Body::~Body() { + auto cpu_place = paddle::platform::CPUPlace(); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::MKLDNNDeviceContext* dev_ctx = + (platform::MKLDNNDeviceContext*)pool.Get(cpu_place); + dev_ctx->ResetBlobMap(); +} + void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id( size_t sid) { cur_mkldnn_session_id = sid; @@ -508,6 +522,14 @@ void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) { } } +const mkldnn::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) { + return cur_engine; +} + +mkldnn::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { + return cur_stream; +} + void MKLDNNDeviceContext::ResetBlobMap() { std::lock_guard lock(*p_mutex_); if (!block_next_cache_clearing_) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index f058da97b5cfa2358873dea6e3efec997fb40dff..e37a5e18e013605e66ab7d287e209a90c190eb06 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -525,8 +525,12 @@ class MKLDNNDeviceContextThreadLocals { // Recently registered data_format. This is needed to // know for converting MKL-DNN Tensor to non MKL-DNN paddle::framework::DataLayout cur_paddle_data_layout; + // MKL-DNN stream used for execution of primitives (per-thread) + mkldnn::engine cur_engine; + mkldnn::stream cur_stream; Body(); + ~Body(); void set_cur_mkldnn_session_id(size_t sid); size_t get_cur_mkldnn_session_id(void); void set_cur_input_shape_str(std::string input_shape_str); @@ -534,6 +538,8 @@ class MKLDNNDeviceContextThreadLocals { void set_cur_paddle_data_layout(framework::DataLayout dl); framework::DataLayout get_cur_paddle_data_layout(void); void log_lib_version(void); + const mkldnn::engine& get_engine(void); + mkldnn::stream& get_stream(void); }; MKLDNNDeviceContextThreadLocals() = default; MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = @@ -572,7 +578,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { explicit MKLDNNDeviceContext(CPUPlace place); /* \brief Get the active engine */ - const mkldnn::engine& GetEngine() const { return engine_; } + const mkldnn::engine& GetEngine() const { return tls().get_engine(); } // Remove all entries from the blob map void ResetBlobMap(); @@ -605,7 +611,6 @@ class MKLDNNDeviceContext : public CPUDeviceContext { } private: - mkldnn::engine engine_; std::shared_ptr p_blobmap_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 37747cd3fd302cc3ceba82e3d3bcea52cac8d5c8..79c536508da123750cb828e9aa38043049a857de 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -188,7 +188,7 @@ MKLDNNGetDataType() { inline void Reorder(mkldnn::memory src, mkldnn::memory dst, const mkldnn::engine& engine) { auto reorder_prim = mkldnn::reorder(src, dst); - mkldnn::stream astream(engine); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); reorder_prim.execute(astream, src, dst); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 55a230cabefaa0bed7ac6658e9acbce9aecd9bb4..37aae14c83a4dbabc5a156b3381e2e9e5a4c9d6c 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -232,7 +232,7 @@ class MKLDNNHandlerT { dev_ctx_.SetBlob(key_reorder_p, reorder_p); } - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); @@ -261,7 +261,7 @@ class MKLDNNHandlerT { std::make_shared(*user_memory_p, *target_memory_p); dev_ctx_.SetBlob(key_reorder_p, reorder_p); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, @@ -273,7 +273,7 @@ class MKLDNNHandlerT { dev_ctx_.SetBlob(user_key, user_memory_p); dev_ctx_.SetBlob(target_key, target_memory_p); } else if (!is_persistent) { - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); auto user_memory_p = std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); @@ -425,7 +425,7 @@ class MKLDNNHandler { auto reorder_p = std::make_shared(*user_memory_p, *target_memory_p); dev_ctx_.SetBlob(key_reorder_p, reorder_p); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, @@ -451,7 +451,7 @@ class MKLDNNHandler { auto target_memory_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - mkldnn::stream astream(engine_); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (target_memory_p == nullptr) { target_memory_p = user_memory_p;