diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index a42d2913187df5dd2d2bee29bf3372674516717e..e6faeb5e0ff43eb7602c8932678e367eb7a18b56 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -193,7 +193,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
     auto reorder_p =
         handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
 
-    mkldnn::stream astream(cpu_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     platform::RecordEvent record_reorder("ext_reorder",
                                          platform::EventRole::kUniqueOp);
     reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index db63481323073401ee13c1b2d0f463a6fdbad169..0ecb6266e4a16cc5d2c22a47c0513da4410a6215 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -48,7 +48,7 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type, dev_ctx,
                                            onednn_engine, key);
 
-    mkldnn::stream astream(onednn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     auto reorder_src_memory_p = handler.AcquireSrcMemory(
         dout->format(), platform::to_void_cast(dout->data<T>()));
 
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index e679f62a25ac2c2bc134e0e55bbed723b7fcd0bd..8a646e5865d922652f7f01509e7e8bbf06da48ea 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -68,7 +68,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
 
     const auto binary_prim = handler.AcquireForwardPrimitive();
 
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
     const std::unordered_map<int, dnnl::memory> args = {
         {DNNL_ARG_SRC_0, *src_x_memory},
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 1eed49de784089a078b462e7ab0e47a456df3c1b..da811faa41bc765cb65442e2372c30b40458bcfe 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -246,7 +246,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
       memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
                                                 this->engine_);
 
-      dnnl::stream astream(this->engine_);
+      auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
       dnnl::reorder(user_h0_memory, *memory_p, attr_)
           .execute(astream, user_h0_memory, *memory_p);
 
@@ -284,7 +284,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
       memory_p = std::make_shared<dnnl::memory>(
           this->fwd_pd_->weights_layer_desc(), this->engine_);
 
-      dnnl::stream astream(this->engine_);
+      auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
       dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);
 
@@ -337,7 +337,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
       memory_p = std::make_shared<dnnl::memory>(
           this->fwd_pd_->weights_iter_desc(), this->engine_);
 
-      dnnl::stream astream(this->engine_);
+      auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
       dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);
 
@@ -469,7 +469,7 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
 
     auto gru_forward_p = handler.AcquireForwardPrimitive();
 
-    dnnl::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     gru_forward_p->execute(astream, gru_args);
     astream.wait();
 
diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
index 11711bab81735efd0494d454c33cf5aa0f0274a3..cc3de3ee53c1528a2a7061b07eecca835975f830 100644
--- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
@@ -292,7 +292,7 @@ class MultiGRUHandler {
 
     auto gru_forward_p0 = AcquireGruPrimitive(layer, dir);
 
-    dnnl::stream astream(engine_);
+    auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
     gru_forward_p0->execute(astream, gru_args);
     astream.wait();
     return out_mem;
@@ -315,7 +315,7 @@ class MultiGRUHandler {
       memory_p = std::make_shared<dnnl::memory>(
           gru_pds_[{layer, dir}]->src_iter_desc(), engine_);
 
-      dnnl::stream astream(engine_);
+      auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
       dnnl::reorder(user_h0_memory, *memory_p, attrs_[2 * layer + (dir == R2L)])
           .execute(astream, user_h0_memory, *memory_p);
 
@@ -354,7 +354,7 @@ class MultiGRUHandler {
       memory_p = std::make_shared<dnnl::memory>(
           gru_pds_[{layer, dir}]->weights_layer_desc(), engine_);
 
-      dnnl::stream astream(engine_);
+      auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
       dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)])
           .execute(astream, user_memory, *memory_p);
 
@@ -410,7 +410,7 @@ class MultiGRUHandler {
       memory_p = std::make_shared<dnnl::memory>(
           gru_pds_[{layer, dir}]->weights_iter_desc(), engine_);
 
-      dnnl::stream astream(engine_);
+      auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
       dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)])
           .execute(astream, user_memory, *memory_p);
 
@@ -516,7 +516,7 @@ class MultiGRUHandler {
 
     auto concat_p = AcquireConcatPrimitive(layer);
 
-    dnnl::stream astream(engine_);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     concat_p->execute(astream, concat_args);
     astream.wait();
     return out_mem;
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 5c49e87730e1451ef6c3a65a9f38add87dc1f030..49645c330922a1f088471346ffd5f9cf67507ef1 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -112,7 +112,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y);
   auto activation_p = handler.AcquireForwardPrimitive();
 
-  mkldnn::stream astream(dev_ctx.GetEngine());
+  auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
   activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p},
                                   {MKLDNN_ARG_TO, *dst_memory_p}});
   astream.wait();
@@ -158,7 +158,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
   auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x);
   auto activation_backward_p = handler.AcquireBackwardPrimitive();
 
-  mkldnn::stream astream(dev_ctx.GetEngine());
+  auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
   activation_backward_p->execute(astream,
                                  {{MKLDNN_ARG_SRC, *src_memory_p},
                                   {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p},
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
index e53e052a89c6221e21b536fa8567ae013f5007be..75367ba0573209338b3ba85ab2ac7240f07d58d3 100644
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -220,7 +220,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     y->set_layout(DataLayout::kMKLDNN);
     y->set_format(platform::GetMKLDNNFormat(*dst_memory));
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
     batch_norm_p->execute(astream,
                           {{MKLDNN_ARG_SRC, *src_memory},
                            {MKLDNN_ARG_SCALE_SHIFT, *scaleshift_memory},
@@ -321,7 +321,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     // finally create batch_norm backward primitive
     auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive();
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
     batch_norm_bwd_p->execute(
         astream, {{MKLDNN_ARG_SRC, *src_memory},
                   {MKLDNN_ARG_MEAN, *mean_memory},
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 63aa2357beea074c9ab6a11230d50a9ea114863a..4beb7ad017851b513331b0e40dc5a9ee35939716 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -202,7 +202,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           output->mutable_data<T>(place, concat_pd->dst_desc().get_size()));
     }
 
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     std::unordered_map<int, memory> args;
     for (size_t i = 0; i < multi_input.size(); ++i) {
       args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, (*srcs).at(i)});
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 68fe5828388ee2d89e669d771da9acb06fbcd4d0..67b857aac02380740547cc69c772fb4e7478c07d 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -471,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       args.insert({MKLDNN_ARG_BIAS, *bias_memory_p});
     }
 
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     conv_p->execute(astream, args);
     astream.wait();
 
@@ -553,7 +553,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
         dev_ctx.GetBlob(prim_key));
 
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
     if (conv_p == nullptr || !is_test) {
       float fuse_alpha = ctx.Attr<float>("fuse_alpha");
@@ -1045,7 +1045,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         user_weights_md, to_void_cast<T>(filter_data));
     auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory(
         user_diff_dst_md, to_void_cast<T>(output_grad_data));
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (filter_grad) {
       auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive(
           user_src_memory_p, pipeline);
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
index 1eb90451a6952944afc3faee335e2a010fb3c2de..f5e62cb44eec4bc49d80035e5a0c6520468e61b8 100644
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -242,7 +242,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     auto conv_p = handler.AcquireConvolution();
 
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (bias) {
       const T* bias_data = bias->data<T>();
       auto user_bias_md = platform::MKLDNNMemDesc(
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 8d41b750972352df7c957c6295cab972f3031a2a..0c8ea84296ec085abde9413329cc643df99f7aa8 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -124,7 +124,7 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
       dst_memory->set_data_handle(output->mutable_data<float>(ctx.GetPlace()));
     }
 
-    mkldnn::stream astream(engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     reorder_p->execute(astream, *src_memory, *dst_memory);
     astream.wait();
 
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 89a24cab5f6745e66b77ba02dd24daeae90aa122..dae9ccd31691ac38409db655c961d1cd3c3eec1d 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -137,7 +137,7 @@ class FCPrimitiveFactory {
   }
 
   void Execute() {
-    mkldnn::stream astream(engine_);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (bias_) {
       fc_->execute(astream, {{MKLDNN_ARG_SRC, *input_},
                              {MKLDNN_ARG_WEIGHTS, *weights_},
@@ -280,7 +280,7 @@ class FCPrimitiveFactory {
     auto dst_mem = std::make_shared<memory>(dst_desc, engine_);
 
     auto reorder = mkldnn::reorder(src_mem, *dst_mem);
-    mkldnn::stream astream(engine_);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
     {
       platform::RecordEvent record_reorder("int_reorder",
@@ -309,7 +309,7 @@ class FCPrimitiveFactory {
     attributes.set_output_scales(mask, scale_data);
     auto reorder = mkldnn::reorder(*src_mem, *dst_mem, attributes);
 
-    mkldnn::stream astream(engine_);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     {
       platform::RecordEvent record_reorder("int_reorder",
                                            platform::EventRole::kUniqueOp);
diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
index f7df19ead9921a8a0fe7c1617777a41811f793a2..64a1903c2da4ff5bc5e903ab33124d49bf1b8cdd 100644
--- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
@@ -154,7 +154,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {
     auto resampling_prim = handler.AcquireForwardPrimitive();
     const std::unordered_map<int, dnnl::memory> args = {
         {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}};
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     resampling_prim->execute(astream, args);
     astream.wait();
 
diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
index 65dcb328f20839d4dc9f37e1b7175a4a0245e99e..cc4bfbae2665fe7030dccd48b28c8819164e68c7 100644
--- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
@@ -120,7 +120,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     auto layer_norm_p = handler.AcquireForwardPrimitive();
 
-    dnnl::stream astream(dev_ctx.GetEngine());
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     std::unordered_map<int, dnnl::memory> args;
 
     args.insert({DNNL_ARG_SRC, *src_memory});
diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
index 9ee653ec58912b326dc44f3f2289bd9dac6b3c62..e2e9d280027b6a30958b308429cbb21d61fb2c08 100644
--- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
@@ -59,7 +59,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto workspace_memory = handler.AcquireWorkspaceMemory(mid);
     mid->set_layout(framework::DataLayout::kMKLDNN);
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (!workspace_memory->get_desc().is_zero()) {
       mid->set_format(platform::GetMKLDNNFormat(*workspace_memory));
       lrn_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory},
@@ -118,7 +118,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     auto lrn_bwd = handler.AcquireBackwardPrimitive();
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     lrn_bwd->execute(astream, {{MKLDNN_ARG_SRC, *src_memory},
                                {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
                                {MKLDNN_ARG_DIFF_SRC, *diff_src_memory},
diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
index 46d51606d42da8aa6b946bebf506a0e31a485e57..b3d970c7f0513f903f637044de80775db67d7121 100644
--- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
@@ -109,7 +109,7 @@ class MulPrimitiveFactory {
 
     auto reorder = mkldnn::reorder(reorder_pd);
 
-    mkldnn::stream astream(engine_);
+    auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
     {
       platform::RecordEvent record_reorder("int_reorder",
                                            platform::EventRole::kUniqueOp);
@@ -184,7 +184,7 @@ class MulPrimitiveFactory {
   }
 
   void Execute() {
-    mkldnn::stream astream(engine_);
+    auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
     (*mul_).execute(astream, {{MKLDNN_ARG_SRC, *x_input_},
                               {MKLDNN_ARG_WEIGHTS, *y_input_},
                               {MKLDNN_ARG_DST, *output_}});
@@ -270,8 +270,7 @@ class MulPrimitiveFactory {
 
     auto reorder = mkldnn::reorder(src_mem, dst_mem);
 
-    mkldnn::stream astream(engine_);
-
+    auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
     {
       platform::RecordEvent record_reorder("int_reorder",
                                            platform::EventRole::kUniqueOp);
@@ -355,7 +354,7 @@ class MulMKLDNNKernel : public framework::OpKernel<XT> {
                           "Operator DNNL Mul must use CPUPlace"));
     platform::MKLDNNDeviceContext::tls().log_lib_version();
     auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    const auto &mkldnn_engine = dev_ctx.GetEngine();
+    auto &mkldnn_engine = dev_ctx.GetEngine();
 
     const Tensor *x = ctx.Input<Tensor>("X");
     const Tensor *y = ctx.Input<Tensor>("Y");
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index 9488a1a4405a46e6666d537dd9e441125fbfbaa9..04a4bc91fe43a900a3b2c194787297f215482a9f 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -51,7 +51,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     auto pool_p = handler.AcquireForwardPrimitive();
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if ((ctx.Attr<bool>("is_test") == false) &&
         (ctx.Attr<std::string>("pooling_type") == "max")) {
       // Training
@@ -154,7 +154,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     auto pool_bwd_p = handler.AcquireBackwardPrimitive();
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (pooling_type == "max") {
       // Max - pooling needs Workspace
       auto workspace_memory = handler.AcquireWorkspaceMemory();
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 7a03c6ce86d4bde22c15b8e4019b2b9bcec752e3..819c0d15505ca975301e2e37cfaaad4d3c165d2b 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -140,7 +140,7 @@ class QuantOpKernel : public framework::OpKernel<T> {
       }
     }
 
-    mkldnn::stream astream(engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     {
       platform::RecordEvent record_reorder("int_reorder",
                                            platform::EventRole::kUniqueOp);
diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
index aa74a45e3a575f4078416fd695ccdec055053e09..33422455ada29678fa1ac55ce0d8ec47e3ba720a 100644
--- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
@@ -137,7 +137,7 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
       }
     }
 
-    dnnl::stream astream(engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     {
       platform::RecordEvent record_reorder("int_reorder",
                                            platform::EventRole::kUniqueOp);
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index abe0a556536638a702d380f4349f31ff78cb3492..1138d5113929329462a7ea6ccd01f1b7bc375322 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -117,7 +117,7 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
 
     auto softmax_p = handler.AcquireForwardPrimitive();
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
     softmax_p->execute(astream, {{DNNL_ARG_SRC, *softmax_src_memory_p},
                                  {DNNL_ARG_DST, *softmax_dst_memory_p}});
     astream.wait();
@@ -169,7 +169,7 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
 
     auto softmax_bwd_p = handler.AcquireBackwardPrimitive();
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     softmax_bwd_p->execute(astream,
                            {{MKLDNN_ARG_DST, *dst_memory_p},
                             {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p},
diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
index 2b6f959472491e346cf98a2bb238d7d21343a3a5..7618b1d9c31218bf6e15b048801a3bb196a94fce 100644
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
@@ -178,7 +178,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     }
     args.insert({MKLDNN_ARG_DST, *dst_mem});
 
-    mkldnn::stream astream(dev_ctx.GetEngine());
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     sum_p->execute(astream, args);
     astream.wait();
 
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index feda5645b4cfa2bf580cc5bcefbc41d124dd3cc5..4c46a92700996a14ebe06486f207309816eaf98a 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -61,7 +61,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
                                                 transpose_src_memory_p);
 
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     transpose_p->execute(astream, *transpose_src_memory_p,
                          *transpose_dst_memory_p);
     astream.wait();
@@ -116,7 +116,7 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
                                                 transpose_src_memory_p);
 
-    mkldnn::stream astream(mkldnn_engine);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     transpose_p->execute(astream, *transpose_src_memory_p,
                          *transpose_dst_memory_p);
     astream.wait();
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 4d952ecda0caf7ba94a80459f6e0c051fd0347fb..23690cb879123198376314f0bf264be2b97393b5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -458,20 +458,34 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 
 #ifdef PADDLE_WITH_MKLDNN
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place),
-      engine_(mkldnn::engine::kind::cpu, 0),
-      p_blobmap_() {
+    : CPUDeviceContext(place), p_blobmap_() {
   p_blobmap_.reset(new BlobMap());
   p_mutex_.reset(new std::mutex());
 }
 
-MKLDNNDeviceContextThreadLocals::Body::Body() {
+MKLDNNDeviceContextThreadLocals::Body::Body()
+    : cur_engine(mkldnn::engine::kind::cpu, 0), cur_stream(cur_engine) {
   cur_mkldnn_session_id = kMKLDNNSessionID_Default;
   cur_input_shape_str = "";
   cur_input_shape_cache_capacity = 1;
   cur_paddle_data_layout = paddle::framework::DataLayout::kNCHW;
 }
 
+// When Thread finish we clear oneDNN cache
+// This is needed when we have one executor used by many threads
+// e.g. test_analyzer_detect. Thread ID is not part of caching key
+// (for naive executor) so we need to clear cache when one thread finish
+// and other is to start inference
+// TODO(jczaja): Ideally it would be good to clear only part of cache
+// related to thread that is to be terminated
+MKLDNNDeviceContextThreadLocals::Body::~Body() {
+  auto cpu_place = paddle::platform::CPUPlace();
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  platform::MKLDNNDeviceContext* dev_ctx =
+      (platform::MKLDNNDeviceContext*)pool.Get(cpu_place);
+  dev_ctx->ResetBlobMap();
+}
+
 void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id(
     size_t sid) {
   cur_mkldnn_session_id = sid;
@@ -508,6 +522,14 @@ void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) {
   }
 }
 
+const mkldnn::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) {
+  return cur_engine;
+}
+
+mkldnn::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) {
+  return cur_stream;
+}
+
 void MKLDNNDeviceContext::ResetBlobMap() {
   std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
   if (!block_next_cache_clearing_) {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index f058da97b5cfa2358873dea6e3efec997fb40dff..e37a5e18e013605e66ab7d287e209a90c190eb06 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -525,8 +525,12 @@ class MKLDNNDeviceContextThreadLocals {
     // Recently registered data_format. This is needed to
     // know for converting MKL-DNN Tensor to non MKL-DNN
     paddle::framework::DataLayout cur_paddle_data_layout;
+    // MKL-DNN stream used for execution of primitives (per-thread)
+    mkldnn::engine cur_engine;
+    mkldnn::stream cur_stream;
 
     Body();
+    ~Body();
     void set_cur_mkldnn_session_id(size_t sid);
     size_t get_cur_mkldnn_session_id(void);
     void set_cur_input_shape_str(std::string input_shape_str);
@@ -534,6 +538,8 @@ class MKLDNNDeviceContextThreadLocals {
     void set_cur_paddle_data_layout(framework::DataLayout dl);
     framework::DataLayout get_cur_paddle_data_layout(void);
     void log_lib_version(void);
+    const mkldnn::engine& get_engine(void);
+    mkldnn::stream& get_stream(void);
   };
   MKLDNNDeviceContextThreadLocals() = default;
   MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) =
@@ -572,7 +578,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   explicit MKLDNNDeviceContext(CPUPlace place);
 
   /* \brief  Get the active engine */
-  const mkldnn::engine& GetEngine() const { return engine_; }
+  const mkldnn::engine& GetEngine() const { return tls().get_engine(); }
 
   // Remove all entries from the blob map
   void ResetBlobMap();
@@ -605,7 +611,6 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   }
 
  private:
-  mkldnn::engine engine_;
   std::shared_ptr<BlobMap> p_blobmap_;
   std::shared_ptr<std::mutex> p_mutex_;
   bool block_next_cache_clearing_ = false;
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 37747cd3fd302cc3ceba82e3d3bcea52cac8d5c8..79c536508da123750cb828e9aa38043049a857de 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -188,7 +188,7 @@ MKLDNNGetDataType<paddle::platform::bfloat16>() {
 inline void Reorder(mkldnn::memory src, mkldnn::memory dst,
                     const mkldnn::engine& engine) {
   auto reorder_prim = mkldnn::reorder(src, dst);
-  mkldnn::stream astream(engine);
+  auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
   platform::RecordEvent record_reorder("int_reorder",
                                        platform::EventRole::kUniqueOp);
   reorder_prim.execute(astream, src, dst);
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 55a230cabefaa0bed7ac6658e9acbce9aecd9bb4..37aae14c83a4dbabc5a156b3381e2e9e5a4c9d6c 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -232,7 +232,7 @@ class MKLDNNHandlerT {
       dev_ctx_.SetBlob(key_reorder_p, reorder_p);
     }
 
-    mkldnn::stream astream(engine_);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
     platform::RecordEvent record_reorder("int_reorder",
                                          platform::EventRole::kUniqueOp);
@@ -261,7 +261,7 @@ class MKLDNNHandlerT {
             std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
         dev_ctx_.SetBlob(key_reorder_p, reorder_p);
 
-        mkldnn::stream astream(engine_);
+        auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
         platform::RecordEvent record_reorder("int_reorder",
                                              platform::EventRole::kUniqueOp);
         reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
@@ -273,7 +273,7 @@ class MKLDNNHandlerT {
       dev_ctx_.SetBlob(user_key, user_memory_p);
       dev_ctx_.SetBlob(target_key, target_memory_p);
     } else if (!is_persistent) {
-      mkldnn::stream astream(engine_);
+      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
       auto user_memory_p =
           std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(user_key));
@@ -425,7 +425,7 @@ class MKLDNNHandler {
       auto reorder_p =
           std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
       dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-      mkldnn::stream astream(engine_);
+      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
       platform::RecordEvent record_reorder("int_reorder",
                                            platform::EventRole::kUniqueOp);
       reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
@@ -451,7 +451,7 @@ class MKLDNNHandler {
     auto target_memory_p =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
 
-    mkldnn::stream astream(engine_);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
     if (target_memory_p == nullptr) {
       target_memory_p = user_memory_p;