Disable pool&conv_transpose&quantize caching (#36695)

* - WIP - compilation fix - fix - fixes - fix - fix - fix again - fix - another fix - another compilation fix - fix - fix - fix - lint * - pool2d partially stripped from cache - pool2d partially stripped of caching * - compilation fix * - compilation fix * - Fix to UT of caching * - Enabling test_conv3d_mkldnn * - conv_transpose stripped of cache * - compilation fix * - fix * - fix * - compilation fix * - fix * Reverted disabling caching of conv2d * - compilation fix * - ut reverted

Disable pool&conv_transpose&quantize caching (#36695)
* - WIP - compilation fix - fix - fixes - fix - fix - fix again - fix - another fix - another compilation fix - fix - fix - fix - lint * - pool2d partially stripped from cache - pool2d partially stripped of caching * - compilation fix * - compilation fix * - Fix to UT of caching * - Enabling test_conv3d_mkldnn * - conv_transpose stripped of cache * - compilation fix * - fix * - fix * - compilation fix * - fix * Reverted disabling caching of conv2d * - compilation fix * - ut reverted
db6c00c4 · Jacek Czaja · GitHub · 9a53477c · db6c00c4 · db6c00c4
8 changed files
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -21,7 +21,6 @@ namespace operators {
 using paddle::framework::LoDTensor;
 using paddle::framework::Tensor;
 using paddle::platform::CPUDeviceContext;
-using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;

--- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
@@ -21,7 +21,6 @@ namespace operators {
 using paddle::framework::LoDTensor;
 using paddle::framework::Tensor;
 using paddle::platform::CPUDeviceContext;
-using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;

--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -565,7 +565,7 @@ class ConvMKLDNNHandlerT
      const auto target_mem_p = this->AcquireMemory(target_key_suffix);
      user_mem_p->set_data_handle(platform::to_void_cast<T>(in_mem_data));
      if (user_mem_p != target_mem_p) {
-        this->AcquireReorder(user_mem_p, target_mem_p, key_mem);
+        this->AcquireReorder(user_mem_p, target_mem_p);
      }
      return target_mem_p;
    }
@@ -643,7 +643,7 @@ class ConvMKLDNNHandlerT
        platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) {
      auto residual_memory_p = this->AcquireResidualMemory(residual_param);
      dst_memory_p = this->template AcquireDstMemory<T_out>(output);
-      this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst");
+      this->AcquireReorder(residual_memory_p, dst_memory_p);
    } else {
      // Changing ShareDataWith to TensorCopy results in performance drop
      // on ResNet architectures

--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -40,21 +40,17 @@ inline mkldnn::memory::dims GetWeightsTz(const Tensor* filter,
 template <typename T, typename K, typename T_out>
 class ConvTransposeMKLDNNHandlerT
-    : public platform::MKLDNNHandlerT<T, mkldnn::deconvolution_forward> {
+    : public platform::MKLDNNHandlerNoCachingT<T,
+                                               mkldnn::deconvolution_forward> {
 public:
  ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx,
-                              const platform::MKLDNNDeviceContext& dev_ctx,
                              const mkldnn::engine mkldnn_engine,
-                              platform::Place cpu_place, const Tensor* input,
+                              const Tensor* input, const Tensor* filter,
-                              const Tensor* filter, const Tensor* bias,
+                              const Tensor* bias, Tensor* output)
-                              Tensor* output, const std::string& unique_name)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::deconvolution_forward>(
-      : platform::MKLDNNHandlerT<T, mkldnn::deconvolution_forward>(
+            mkldnn_engine, ctx.GetPlace()),
-            dev_ctx, mkldnn_engine, cpu_place,
+        is_test_(ctx.Attr<bool>("is_test")) {
-            platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
+    PADDLE_ENFORCE_EQ(is_test_, true,
-                                unique_name)) {
-    if (!this->isCached()) {
-      const bool is_test = ctx.Attr<bool>("is_test");
-      PADDLE_ENFORCE_EQ(is_test, true,
                      platform::errors::InvalidArgument(
                          "ConvTransposeMKLDNN works only for inference. "
                          "The attribute \'is_test\' value should be set to "
@@ -99,9 +95,9 @@ class ConvTransposeMKLDNNHandlerT
                        platform::errors::InvalidArgument(
                            "Got wrong format for Bias tensor."));
-        PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
+      PADDLE_ENFORCE_EQ(
-                          platform::errors::InvalidArgument(
+          bias->dims().size(), 1,
-                              "Bias must only have 1 dimension, "
+          platform::errors::InvalidArgument("Bias must only have 1 dimension, "
                                            "i.e. X, but got dimension = %d .",
                                            bias->dims().size()));
    }
@@ -113,12 +109,10 @@ class ConvTransposeMKLDNNHandlerT
    mkldnn::memory::dims paddings(begin(paddings_temp), end(paddings_temp));
    std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
-      mkldnn::memory::dims dilations(begin(dilations_temp),
+    mkldnn::memory::dims dilations(begin(dilations_temp), end(dilations_temp));
-                                     end(dilations_temp));
    int groups = ctx.Attr<int>("groups");
-      std::string padding_algorithm =
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-          ctx.Attr<std::string>("padding_algorithm");
    PADDLE_ENFORCE_EQ(
        strides.size(), 2,
@@ -169,22 +163,21 @@ class ConvTransposeMKLDNNHandlerT
    const mkldnn::primitive_attr conv_trans_attr =
        CreatePostOps(fuse_activation, fuse_alpha, fuse_beta);
-      auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
+    auto fwd_prop_kind = is_test_ ? mkldnn::prop_kind::forward_inference
                                  : mkldnn::prop_kind::forward_training;
    if (bias) {
      std::vector<int64_t> bias_tz = framework::vectorize(bias->dims());
      const auto bias_md =
          platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x);
      this->AcquireForwardPrimitiveDescriptor(
-            conv_trans_attr, fwd_prop_kind,
+          conv_trans_attr, fwd_prop_kind, dnnl::algorithm::deconvolution_direct,
-            dnnl::algorithm::deconvolution_direct, src_md, weights_md, bias_md,
+          src_md, weights_md, bias_md, dst_md, strides, dilations,
-            dst_md, strides, dilations, mkldnn_paddings[0], mkldnn_paddings[1]);
+          mkldnn_paddings[0], mkldnn_paddings[1]);
    } else {
      this->AcquireForwardPrimitiveDescriptor(
-            conv_trans_attr, fwd_prop_kind,
+          conv_trans_attr, fwd_prop_kind, dnnl::algorithm::deconvolution_direct,
-            dnnl::algorithm::deconvolution_direct, src_md, weights_md, dst_md,
+          src_md, weights_md, dst_md, strides, dilations, mkldnn_paddings[0],
-            strides, dilations, mkldnn_paddings[0], mkldnn_paddings[1]);
+          mkldnn_paddings[1]);
-      }
    }
  }
@@ -217,34 +210,17 @@ class ConvTransposeMKLDNNHandlerT
  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryWithReorder(
      const framework::Tensor* input) {
    const T* input_data = input->data<T>();
-    const std::string user_key_suffix{"@src_mem_p_user"};
-    auto user_src_mem_p = this->AcquireMemory(user_key_suffix);
-    if (!user_src_mem_p) {
    auto user_src_md = platform::MKLDNNMemDesc(
        framework::vectorize(input->dims()), platform::MKLDNNGetDataType<T>(),
        input->format());
-      return this->AcquireMemoryWithReorder(
+    return platform::MKLDNNHandlerNoCachingT<T, mkldnn::deconvolution_forward>::
-          user_src_md, this->fwd_pd_->src_desc(),
+        AcquireMemoryWithReorder(user_src_md, this->fwd_pd_->src_desc(),
-          platform::to_void_cast<T>(input_data), "@src_mem_p");
+                                 platform::to_void_cast<T>(input_data));
-    } else {
-      const std::string target_key_suffix{"@src_mem_p_target"};
-      const auto target_src_mem_p = this->AcquireMemory(target_key_suffix);
-      user_src_mem_p->set_data_handle(platform::to_void_cast<T>(input_data));
-      if (user_src_mem_p != target_src_mem_p) {
-        this->AcquireReorder(user_src_mem_p, target_src_mem_p, "@src_mem_p");
-      }
-      return target_src_mem_p;
-    }
  }
  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryWithReorder(
-      const framework::Tensor* filter, const int& groups, const bool& is_test) {
+      const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key,
-    // This is workaround to make execution faster, delete
+      const framework::Tensor* filter, const int& groups) {
-    // if statement after including md inside Tensor
-    auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target");
-    if (is_test && weights_mem_p) {
-      return weights_mem_p;
-    } else {
    const K* filter_data = filter->data<K>();
    auto weights_tz = GetWeightsTz(filter, groups);
    int g = std::max(groups, 1);
@@ -276,27 +252,98 @@ class ConvTransposeMKLDNNHandlerT
    };
    return this->template AcquireMemoryWithReorder<K>(
-          user_src_md, this->fwd_pd_->weights_desc(),
+        dev_ctx, user_src_md, this->fwd_pd_->weights_desc(),
-          platform::to_void_cast<K>(filter_data), "@weights_mem_p", is_test,
+        platform::to_void_cast<K>(filter_data), key, "@weights_mem_p", is_test_,
        iohw2oihw_reorder);
  }
+  template <typename F = T>
+  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
+      const platform::MKLDNNDeviceContext& dev_ctx,
+      const mkldnn::memory::desc& user_md,
+      const mkldnn::memory::desc& target_md, void* ptr, const std::string& key,
+      const std::string& suffix, bool is_persistent = false,
+      std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {},
+      const std::vector<float>& scale_data = {1.0f}, int mask = 0) {
+    const auto target_key = key + suffix + "_target";
+    const auto key_reorder_p = key + suffix + "reorder_p";
+    const auto user_key = key + suffix + "_user";
+    auto target_memory_p =
+        std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(target_key));
+    if (target_memory_p == nullptr) {
+      if (custom_reorder_func) {
+        auto reordered_data =
+            custom_reorder_func(reinterpret_cast<const F*>(ptr));
+        dev_ctx.SetBlob(key_reorder_p + "-custom_reorder", reordered_data);
+        ptr = reinterpret_cast<void*>(reordered_data.get());
      }
+      auto user_memory_p =
+          std::make_shared<dnnl::memory>(user_md, this->engine_, ptr);
+      if (user_md != target_md) {
+        target_memory_p =
+            std::make_shared<mkldnn::memory>(target_md, this->engine_);
+        dnnl::reorder::primitive_desc reorder_pdesc;
+        if (platform::is_int8<T>()) {
+          dnnl::primitive_attr attr;
+          attr.set_output_scales(mask, scale_data);
+          reorder_pdesc = dnnl::reorder::primitive_desc(*user_memory_p,
+                                                        *target_memory_p, attr);
+        } else {
+          reorder_pdesc =
+              dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p);
+        }
+        auto reorder_p = std::make_shared<dnnl::reorder>(reorder_pdesc);
+        dev_ctx.SetBlob(key_reorder_p, reorder_p);
-  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
+        auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-      const framework::Tensor* bias, const bool& is_test) {
+        platform::RecordEvent record_reorder("int_reorder",
-    auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target");
+                                             platform::EventRole::kUniqueOp);
-    if (is_test && bias_mem_p) {
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
-      return bias_mem_p;
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
      } else {
+        target_memory_p = user_memory_p;
+      }
+      dev_ctx.SetBlob(user_key, user_memory_p);
+      dev_ctx.SetBlob(target_key, target_memory_p);
+    } else if (!is_persistent) {
+      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+      auto user_memory_p =
+          std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(user_key));
+      user_memory_p->set_data_handle(ptr);
+      // TODO(jczaja): If a reorder is cached, it means a reorder is needed;
+      // this detection needs to change to get rid of keys
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        platform::RecordEvent record_reorder("int_reorder",
+                                             platform::EventRole::kUniqueOp);
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
+      }
+    }
+    return target_memory_p;
+  }
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
+      const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key,
+      const framework::Tensor* bias) {
    const K* bias_data = bias->data<K>();
    auto user_bias_md = platform::MKLDNNMemDesc(
        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
        MKLDNNMemoryFormat::x);
    return this->AcquireMemoryWithReorder(
-          user_bias_md, this->fwd_pd_->bias_desc(),
+        dev_ctx, user_bias_md, this->fwd_pd_->bias_desc(),
-          platform::to_void_cast<K>(bias_data), "@bias_mem_p", is_test);
+        platform::to_void_cast<K>(bias_data), key, "@bias_mem_p", is_test_);
-    }
  }
+ private:
+  const bool is_test_;
 };
 template <typename T, typename K>
@@ -325,22 +372,21 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
-    const bool is_test = ctx.Attr<bool>("is_test");
    const auto* input = ctx.Input<Tensor>("Input");
    const auto* filter = ctx.Input<Tensor>("Filter");
    const auto* bias =
        ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
    auto* output = ctx.Output<Tensor>("Output");
-    const std::string unique_name = ctx.InputName("Input") +
+    ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(ctx, mkldnn_engine, input,
-                                    ctx.InputName("Filter") +
+                                                     filter, bias, output);
-                                    (bias ? ctx.InputName("Bias") : "");
-    ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(
-        ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias,
-        output, unique_name);
    auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
+    // Caching Key for weights is needed
+    std::string key = platform::CreateKey(dev_ctx, ctx.InputName("Input"),
+                                          ctx.InputName("Filter"),
+                                          (bias ? ctx.InputName("Bias") : ""));
+    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
    auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
-        filter, ctx.Attr<int>("groups"), is_test);
+        dev_ctx, key, filter, ctx.Attr<int>("groups"));
    std::shared_ptr<dnnl::memory> dst_memory_p =
        handler.template AcquireDstMemory<T_out>(output);
@@ -352,7 +398,8 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
        {MKLDNN_ARG_DST, *dst_memory_p}};
    if (bias) {
-      auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test);
+      auto bias_memory_p =
+          handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias);
      args.insert({MKLDNN_ARG_BIAS, *bias_memory_p});
    }
    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();

--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -30,20 +30,15 @@ using platform::to_void_cast;
 template <typename T>
 class PoolingMKLDNNHandler
-    : public platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
+    : public platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward,
                                               mkldnn::pooling_backward> {
 public:
  PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
+                       const mkldnn::engine mkldnn_engine, const Tensor* input,
-                       platform::Place cpu_place, const Tensor* input,
+                       Tensor* output)
-                       Tensor* output, const std::string& unique_name)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward,
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
                                          mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
+            mkldnn_engine, ctx.GetPlace()) {
-            platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
-                                framework::ToMKLDNNDataType(input->type()),
-                                unique_name)) {
-    if (!this->isCached()) {
    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
                      platform::errors::InvalidArgument(
                          "Wrong layout set for Input tensor."));
@@ -131,29 +126,22 @@ class PoolingMKLDNNHandler
            : (exclude_padding
                   ? mkldnn::algorithm::pooling_avg_exclude_padding
                   : mkldnn::algorithm::pooling_avg_include_padding),
-          src_md, dst_md, strides, ksize, mkldnn_paddings[0],
+        src_md, dst_md, strides, ksize, mkldnn_paddings[0], mkldnn_paddings[1]);
-          mkldnn_paddings[1]);
-    }
  }
  PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
+                       const mkldnn::engine mkldnn_engine, const Tensor* in_x,
-                       platform::Place cpu_place, const Tensor* in_x,
+                       const Tensor* out_grad, Tensor* in_x_grad)
-                       const Tensor* out_grad, Tensor* in_x_grad,
-                       const std::string& unique_name)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward,
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
                                          mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
+            mkldnn_engine, ctx.GetPlace()) {
-            platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()),
+    PADDLE_ENFORCE_EQ(
-                                framework::ToMKLDNNDataType(in_x->type()),
+        in_x->layout(), DataLayout::kMKLDNN,
-                                unique_name)) {
+        platform::errors::InvalidArgument("Wrong layout set for Input tensor"));
-    if (!this->isBwdCached()) {
+    PADDLE_ENFORCE_NE(
-      PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN,
+        in_x->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
+        platform::errors::InvalidArgument("Wrong format set for Input tensor"));
-                            "Wrong layout set for Input tensor"));
-      PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Wrong format set for Input tensor"));
    PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
                      platform::errors::InvalidArgument(
@@ -179,8 +167,7 @@ class PoolingMKLDNNHandler
    std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
    bool global_pooling = ctx.Attr<bool>("global_pooling");
-      std::string padding_algorithm =
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-          ctx.Attr<std::string>("padding_algorithm");
    auto in_x_dims = in_x->dims();
    framework::DDim data_dims =
@@ -194,10 +181,8 @@ class PoolingMKLDNNHandler
                             data_dims, strides, ksize);
    auto src_tz = paddle::framework::vectorize<int64_t>(in_x->dims());
-      auto diff_src_tz =
+    auto diff_src_tz = paddle::framework::vectorize<int64_t>(in_x_grad->dims());
-          paddle::framework::vectorize<int64_t>(in_x_grad->dims());
+    auto diff_dst_tz = paddle::framework::vectorize<int64_t>(out_grad->dims());
-      auto diff_dst_tz =
-          paddle::framework::vectorize<int64_t>(out_grad->dims());
    const auto dt = framework::ToMKLDNNDataType(in_x->type());
    auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format());
@@ -205,9 +190,8 @@ class PoolingMKLDNNHandler
        mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any);
    auto diff_dst_md = mkldnn::memory::desc(
        diff_dst_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
-      auto diff_src_md =
+    auto diff_src_md = mkldnn::memory::desc(
-          mkldnn::memory::desc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
+        diff_src_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
-                               MKLDNNMemoryFormat::any);
    auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
    const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
@@ -227,8 +211,7 @@ class PoolingMKLDNNHandler
            : (exclude_padding
                   ? mkldnn::algorithm::pooling_avg_exclude_padding
                   : mkldnn::algorithm::pooling_avg_include_padding),
-          src_md, dst_md, strides, ksize, mkldnn_paddings[0],
+        src_md, dst_md, strides, ksize, mkldnn_paddings[0], mkldnn_paddings[1]);
-          mkldnn_paddings[1]);
    this->AcquireBackwardPrimitiveDescriptor(
        pooling_type == "max"
@@ -239,25 +222,28 @@ class PoolingMKLDNNHandler
        diff_src_md, diff_dst_md, strides, ksize, mkldnn_paddings[0],
        mkldnn_paddings[1]);
  }
-  }
-  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(void) {
+  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(
+      const platform::MKLDNNDeviceContext& dev_ctx,
+      const std::string& unique_name) {
    mkldnn::memory::desc workspace_md = this->fwd_pd_->workspace_desc();
-    // Pooling PD has to be passed to Grad op that
+    // Pooling Workspace has to be passed to Grad op that
    // may be executed by different thread, hence
    // for that one we use key that does not contain TID
-    auto local_key = this->key_common_ + "@workspace";
+    std::string workspace_key =
+        platform::CreateKey(dev_ctx, workspace_md.dims(),
+                            workspace_md.data_type(), unique_name, "@wrk");
    auto mem_p = std::static_pointer_cast<mkldnn::memory>(
-        this->dev_ctx_.GetBlob(local_key));
+        dev_ctx.GetBlob(workspace_key));
    if (mem_p == nullptr) {
      static std::mutex acquire_barrier;
      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
          acquire_barrier);
      mem_p = std::static_pointer_cast<mkldnn::memory>(
-          this->dev_ctx_.GetBlob(local_key));
+          dev_ctx.GetBlob(workspace_key));
      if (mem_p == nullptr) {
        mem_p = std::make_shared<mkldnn::memory>(workspace_md, this->engine_);
-        this->dev_ctx_.SetBlob(local_key, mem_p);
+        dev_ctx.SetBlob(workspace_key, mem_p);
      }
    }
    return mem_p;
@@ -319,8 +305,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    const Tensor* input = ctx.Input<Tensor>("X");
    Tensor* output = ctx.Output<Tensor>("Out");
-    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), input, output,
+    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx.GetEngine(), input, output);
-                                    ctx.OutputName("Out"));
    auto src_memory = handler.AcquireSrcMemory(input);
    auto dst_memory = handler.AcquireDstMemory(output);
@@ -331,7 +316,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    if ((ctx.Attr<bool>("is_test") == false) &&
        (ctx.Attr<std::string>("pooling_type") == "max")) {
      // Training
-      auto workspace_memory = handler.AcquireWorkspaceMemory();
+      auto workspace_memory =
+          handler.AcquireWorkspaceMemory(dev_ctx, ctx.OutputName("Out"));
      pool_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory},
                                {MKLDNN_ARG_DST, *dst_memory},
                                {MKLDNN_ARG_WORKSPACE, *workspace_memory}});
@@ -361,8 +347,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), in_x,
+    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx.GetEngine(), in_x, out_grad,
-                                    out_grad, in_x_grad, ctx.InputName("Out"));
+                                    in_x_grad);
    auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad);
    auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad);
@@ -372,7 +358,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
    if (ctx.Attr<std::string>("pooling_type") == "max") {
      // Max - pooling needs Workspace
-      auto workspace_memory = handler.AcquireWorkspaceMemory();
+      auto workspace_memory =
+          handler.AcquireWorkspaceMemory(dev_ctx, ctx.InputName("Out"));
      pool_bwd_p->execute(astream, {{MKLDNN_ARG_DIFF_SRC, *diff_src_memory},
                                    {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
                                    {MKLDNN_ARG_WORKSPACE, *workspace_memory}});

--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -64,21 +64,11 @@ class QuantOpKernel : public framework::OpKernel<T> {
    bool is_negative_input = ctx.Attr<bool>("is_negative_input");
    bool bfloat16 = ctx.Attr<bool>("bfloat16");
-    std::string key =
+    // TODO(jczaja): Refactor with Acquire API
-        platform::CreateKey(dev_ctx, src_tz, scale_data, scale_shift,
-                            is_negative_input, ctx.OutputName("Output"));
-    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
-    const std::string key_prim = key + "@r";
-    const std::string key_src_mem = key + "@s";
-    const std::string key_dst_mem = key + "@d";
    std::shared_ptr<mkldnn::memory> src_memory;
    std::shared_ptr<mkldnn::memory> dst_memory;
    std::shared_ptr<reorder> reorder_p;
-    reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
-    if (reorder_p == nullptr) {
    std::string out_layout = ctx.Attr<std::string>("output_format");
    MKLDNNMemoryFormat out_format =
        platform::data_format_to_memory_format(out_layout);
@@ -97,8 +87,8 @@ class QuantOpKernel : public framework::OpKernel<T> {
    auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
                                          input->format());
-      src_memory = std::make_shared<mkldnn::memory>(
+    src_memory = std::make_shared<mkldnn::memory>(src_md, engine,
-          src_md, engine, to_void_cast<T>(input_data));
+                                                  to_void_cast<T>(input_data));
    std::shared_ptr<mkldnn::memory::desc> dst_md;
    if (bfloat16) {
@@ -108,38 +98,13 @@ class QuantOpKernel : public framework::OpKernel<T> {
      platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
                                              dst_md, dst_memory, out_format);
    } else {
-        platform::SetDstMemoryQuantized<uint8_t>(
+      platform::SetDstMemoryQuantized<uint8_t>(ctx, output, dst_tz, engine,
-            ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
+                                               dst_md, dst_memory, out_format);
    }
    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
        new reorder::primitive_desc(*src_memory, *dst_memory, attri));
    reorder_p = std::shared_ptr<reorder>(new reorder(*reorder_pd));
-      dev_ctx.SetBlob(key_prim, reorder_p);
-      dev_ctx.SetBlob(key_src_mem, src_memory);
-      dev_ctx.SetBlob(key_dst_mem, dst_memory);
-    } else {
-      src_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_src_mem));
-      src_memory->set_data_handle(to_void_cast<T>(input_data));
-      dst_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_dst_mem));
-      auto place = ctx.GetPlace();
-      if (bfloat16) {
-        dst_memory->set_data_handle(
-            output->mutable_data<paddle::platform::bfloat16>(place));
-      } else if (with_shift || !is_negative_input) {
-        uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
-        if (with_shift) std::memset(output_data, scale_shift, output->numel());
-        dst_memory->set_data_handle(output_data);
-      } else {
-        dst_memory->set_data_handle(
-            output->mutable_data<int8_t>(ctx.GetPlace()));
-      }
-    }
    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
    {
      platform::RecordEvent record_reorder("int_reorder",

--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -207,7 +207,7 @@ class MKLDNNHandlerNoCachingT {
  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
      const mkldnn::memory::desc& user_md,
      const mkldnn::memory::desc& target_md, void* ptr,
-      const std::string& suffix, bool is_persistent = false,
+      bool is_persistent = false,
      std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {}) {
    std::shared_ptr<mkldnn::memory> target_memory_p;
    if (custom_reorder_func) {
@@ -500,18 +500,9 @@ class MKLDNNHandlerT {
  }
  void AcquireReorder(const std::shared_ptr<mkldnn::memory>& user_memory_p,
-                      const std::shared_ptr<mkldnn::memory>& target_memory_p,
+                      const std::shared_ptr<mkldnn::memory>& target_memory_p) {
-                      const std::string& suffix) {
+    auto reorder_p =
-    const auto key_reorder_p = key_ + suffix + "reorder_p";
-    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(key_reorder_p));
-    if (reorder_p == nullptr) {
-      reorder_p =
        std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-    }
    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
@@ -578,6 +569,8 @@ class MKLDNNHandlerT {
          std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(user_key));
      user_memory_p->set_data_handle(ptr);
+      // TODO(jczaja): If a reorder is cached, it means a reorder is needed;
+      // this detection needs to change to get rid of keys
      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
          dev_ctx_.GetBlob(key_reorder_p));
      if (reorder_p != nullptr) {

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
@@ -95,4 +95,6 @@ class TestConv3DOp_Valid_MKLDNN(TestConv3DOp_AsyPadding_MKLDNN):
 if __name__ == '__main__':
+    from paddle import enable_static
+    enable_static()
    unittest.main()