Commit 824a79d3 (unverified)
Authored Jan 26, 2021 by Tao Luo; committed via GitHub on Jan 26, 2021
Revert "Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30661)" (#30708)
This reverts commit d834f4e6.
Parent commit: 7fbc68a2
Showing 12 changed files with 218 additions and 756 deletions (+218, -756)
cmake/operators.cmake  +1 -1
paddle/fluid/operators/fused/CMakeLists.txt  +1 -5
paddle/fluid/operators/fused/fusion_lstm_op.cc  +1 -16
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc  +214 -36
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc  +0 -377
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h  +0 -229
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py  +0 -2
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py  +0 -81
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py  +0 -2
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py  +1 -5
python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py  +0 -1
tools/static_mode_white_list.py  +0 -1
cmake/operators.cmake (view file @ 824a79d3)

@@ -197,7 +197,7 @@ function(op_library TARGET)
       "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
       "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
       "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
+      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
       "fused_bn_add_activation_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
paddle/fluid/operators/fused/CMakeLists.txt (view file @ 824a79d3)

@@ -14,15 +14,11 @@ register_operators(EXCLUDES
         fused_embedding_eltwise_layernorm_op
         fusion_group_op
         fusion_gru_op
-        fusion_lstm_op
         fused_bn_add_activation_op)

 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
-op_library(fusion_lstm_op)
-file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\nUSE_CPU_ONLY_OP(fusion_lstm);\n")
+file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")

 if (WITH_GPU)
   # fused_bn_activation_op needs cudnn 7.4.1 above
paddle/fluid/operators/fused/fusion_lstm_op.cc (view file @ 824a79d3)

@@ -18,9 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif

 namespace paddle {
 namespace operators {

@@ -148,17 +145,8 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library = framework::LibraryType::kPlain;
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_MKLDNN
-  if (this->CanMKLDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kMKLDNN;
-    layout = framework::DataLayout::kMKLDNN;
-  }
-#endif
   return framework::OpKernelType(
-      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
-      layout, library);
+      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context());
 }

 void FusionLSTMOpMaker::Make() {

@@ -247,9 +235,6 @@ void FusionLSTMOpMaker::Make() {
                "`tanh` by default.")
       .SetDefault("tanh")
       .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
   AddComment(R"DOC(
 Fusion Long-Short Term Memory (LSTM) Operator.
 This operator fuse the X into LSTM, more details can refer to LSTM op.
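The lines removed above are the usual pattern for routing an operator to its oneDNN (MKL-DNN) kernel when one is registered: start from the plain CPU kernel type and switch library and layout only when the build has oneDNN and the runtime check passes. A minimal standalone sketch of that dispatch idea follows; the Library/Layout enums and PickLibrary function are hypothetical stand-ins for the Paddle framework types, not Paddle code.

#include <iostream>

// Hypothetical stand-ins for framework::LibraryType / framework::DataLayout.
enum class Library { kPlain, kMKLDNN };
enum class Layout { kAnyLayout, kMKLDNN };

// Mirror of the removed GetExpectedKernelType logic: default to the plain
// kernel, upgrade to oneDNN only if it is compiled in and usable here.
Library PickLibrary(bool built_with_mkldnn, bool can_use_mkldnn, Layout* layout) {
  Library library = Library::kPlain;
  *layout = Layout::kAnyLayout;
  if (built_with_mkldnn && can_use_mkldnn) {
    library = Library::kMKLDNN;
    *layout = Layout::kMKLDNN;
  }
  return library;
}

int main() {
  Layout layout;
  Library lib = PickLibrary(true, true, &layout);
  std::cout << (lib == Library::kMKLDNN ? "oneDNN kernel" : "plain kernel") << "\n";
}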
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc (view file @ 824a79d3)

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
-#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"

 namespace paddle {
 namespace operators {

@@ -27,7 +27,7 @@ using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;

 template <typename T, typename T_out = T>
-class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
+class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
  public:
  GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                   const platform::MKLDNNDeviceContext& dev_ctx,

@@ -37,12 +37,37 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
                   const bool is_reverse, const int64_t N, const int64_t Ti,
                   const int64_t IC, const int64_t OC,
                   const std::string& unique_name)
-      : RNNMKLDNNHandler<T, dnnl::gru_forward, T_out>(
-            ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
-            is_reverse, N, Ti, IC, OC, 3,
-            ctx.InputName("X") + ctx.InputName("WeightH")) {
+      : platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
+            dev_ctx, dev_ctx.GetEngine(), cpu_place,
+            CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
+        N(N),
+        Ti(Ti),
+        IC(IC),
+        OC(OC) {
+    // Create memory key without Ti because weights, bias and h0 memories
+    // do not depend on Ti size but primitive and input/output memory do
+    memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
+        dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));
+
+    // Is it int8 kernel
+    const bool is_INT8 = std::is_same<T, uint8_t>::value;
+
+    if (is_INT8) {
+      // Int8 attributes
+      const float scale_data = ctx.Attr<float>("Scale_data");
+      const float shift_data = ctx.Attr<float>("Shift_data");
+      const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
+
+      const int weights_scale_mask =
+          0 + (1 << 3)  // bit, indicating the unique scales for `g` dim in `ldigo`
+          + (1 << 4);   // bit, indicating the unique scales for `o` dim in `ldigo`
+
+      attr_.set_rnn_data_qparams(scale_data, shift_data);
+      attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
+    }
+
     if (!this->isCached()) {
       // oneDNN kernel has hardcoded activation functions
       PADDLE_ENFORCE_EQ(

@@ -83,35 +108,176 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
                 : dnnl::rnn_direction::unidirectional_left2right;

       this->AcquireForwardPrimitiveDescriptor(
-          this->attr_, dnnl::prop_kind::forward_inference, direction, input_md,
+          attr_, dnnl::prop_kind::forward_inference, direction, input_md,
           h0_md, weight_x_md, weight_h_md, bias_md, hidden_md,
           dnnl::memory::desc());
     }
   }

+  bool is_NTC() {
+    return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
+            dnnl::memory::format_tag::ntc);
+  }
+
+  void reorderRNNdata(void* input_data, void* output_data,
+                      std::vector<size_t> lod, const bool is_reverse,
+                      platform::RNNReorderType reorder_type) {
+    switch (reorder_type) {
+      // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
+      case platform::RNNReorderType::PP_NTC: {
+        auto* input_data_iter = reinterpret_cast<T*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]) * IC;
+          const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
+          memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
+                 sizeof(T) * num_elements);
+          input_data_iter += num_elements;
+        }
+      } break;
+      // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
+      case platform::RNNReorderType::PP_TNC: {
+        auto* input_data_iter = reinterpret_cast<T*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]);
+          const auto offset = is_reverse ? (Ti - num_elements) : 0;
+          for (size_t t = 0; t < num_elements; ++t) {
+            memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
+                   input_data_iter, sizeof(T) * IC);
+            input_data_iter += IC;
+          }
+        }
+      } break;
+      // Reorder output values to PP format [N, T, C] -> [WORDS, C]
+      case platform::RNNReorderType::NTC_PP: {
+        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]) * OC;
+          const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
+          memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
+                 sizeof(T_out) * num_elements);
+          output_data_iter += num_elements;
+        }
+      } break;
+      // Reorder output values to PP format [T, N, C] -> [WORDS, C]
+      case platform::RNNReorderType::TNC_PP: {
+        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = lod[n + 1] - lod[n];
+          const auto offset = is_reverse ? (Ti - num_elements) : 0;
+          for (size_t t = 0; t < num_elements; ++t) {
+            memcpy(output_data_iter,
+                   input_data_iter + (t + offset) * N * OC + n * OC,
+                   sizeof(T_out) * OC);
+            output_data_iter += OC;
+          }
+        }
+      } break;
+    }
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
+      const LoDTensor* input, const bool is_reverse) {
+    const auto name = this->key_ + "@input_mem";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
+                                                this->engine_);
+      this->dev_ctx_.SetBlob(name, memory_p);
+    }
+
+    const auto& input_lod = input->lod()[0];
+    auto* x_data = to_void_cast(input->data<T>());
+
+    auto* x_onednn_data = memory_p->get_data_handle();
+    memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
+
+    if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
+        dnnl::memory::format_tag::ntc) {
+      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
+                     platform::RNNReorderType::PP_NTC);
+    } else {
+      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
+                     platform::RNNReorderType::PP_TNC);
+    }
+    return memory_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
+    const auto name = this->key_ + "@output_mem";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
+                                                this->engine_);
+      this->dev_ctx_.SetBlob(name, memory_p);
+    }
+    return memory_p;
+  }
+
+  // TODO(grygielski) H0 is for now persistable
+  // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
+  // not support in yet)
+  std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
+    const std::string h0_key = memory_key_ + "@h0";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
+
+    if (!memory_p) {
+      auto user_h0_memory = dnnl::memory();
+      if (h0) {
+        user_h0_memory =
+            dnnl::memory({{1, 1, N, OC}, MKLDNNGetDataType<float>(),
+                          MKLDNNMemoryFormat::ldnc},
+                         this->engine_, to_void_cast(h0->data<float>()));
+      } else {
+        user_h0_memory = dnnl::memory({{1, 1, N, OC},
+                                       MKLDNNGetDataType<float>(),
+                                       MKLDNNMemoryFormat::ldnc},
+                                      this->engine_);
+        memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
+      }
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
+                                                this->engine_);
+
+      auto& astream =
+          paddle::platform::MKLDNNDeviceContext::tls().get_stream();
+      dnnl::reorder(user_h0_memory, *memory_p, attr_)
+          .execute(astream, user_h0_memory, *memory_p);
+
+      this->dev_ctx_.SetBlob(h0_key, memory_p);
+    }
+    return memory_p;
+  }
+
   std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
                                                      const bool origin_mode) {
-    const std::string wx_key = this->memory_key_ + "@weight_x";
+    const std::string wx_key = memory_key_ + "@weight_x";
     auto memory_p =
         std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));

     if (!memory_p) {
       auto user_md =
-          MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
-                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
+          MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType<float>(),
+                        MKLDNNMemoryFormat::ldigo);
       auto user_memory = dnnl::memory(user_md, this->engine_);

       auto* weight_x_data =
           reinterpret_cast<float*>(user_memory.get_data_handle());
       memcpy(weight_x_data, weight_x->data<float>(),
-             sizeof(float) * this->IC * this->G * this->OC);
+             sizeof(float) * IC * 3 * OC);

       if (origin_mode == false) {
-        for (int64_t i = 0; i < this->IC; ++i) {
-          for (int64_t j = 0; j < this->OC; ++j) {
+        for (int64_t i = 0; i < IC; ++i) {
+          for (int64_t j = 0; j < OC; ++j) {
             weight_x_data[j] *= -1;
           }
-          weight_x_data += 3 * this->OC;
+          weight_x_data += 3 * OC;
         }
       }

@@ -119,7 +285,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
           this->fwd_pd_->weights_layer_desc(), this->engine_);

       auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
-      dnnl::reorder(user_memory, *memory_p, this->attr_)
+      dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);

       this->dev_ctx_.SetBlob(wx_key, memory_p);

@@ -129,14 +295,14 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
   std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
                                                      const bool origin_mode) {
-    const std::string wh_key = this->memory_key_ + "@weight_h";
+    const std::string wh_key = memory_key_ + "@weight_h";
     auto memory_p =
         std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));

     if (!memory_p) {
       auto user_md =
-          MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
-                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
+          MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType<float>(),
+                        MKLDNNMemoryFormat::ldigo);
       auto user_memory = dnnl::memory(user_md, this->engine_);

       // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to

@@ -146,26 +312,25 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
       auto* user_weight_h_data = weight_h->data<float>();

       auto src1_iter = user_weight_h_data;
-      auto src2_iter = user_weight_h_data + 2 * this->OC * this->OC;
+      auto src2_iter = user_weight_h_data + 2 * OC * OC;

-      for (int64_t c = 0; c < this->OC; ++c) {
-        memcpy(weight_h_data, src1_iter, 2 * this->OC * sizeof(float));
-        memcpy(weight_h_data + 2 * this->OC, src2_iter,
-               this->OC * sizeof(float));
+      for (int64_t c = 0; c < OC; ++c) {
+        memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float));
+        memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float));

-        src1_iter += 2 * this->OC;
-        src2_iter += this->OC;
-        weight_h_data += 3 * this->OC;
+        src1_iter += 2 * OC;
+        src2_iter += OC;
+        weight_h_data += 3 * OC;
       }

       weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());

       if (origin_mode == false) {
-        for (int64_t i = 0; i < this->OC; ++i) {
-          for (int64_t j = 0; j < this->OC; ++j) {
+        for (int64_t i = 0; i < OC; ++i) {
+          for (int64_t j = 0; j < OC; ++j) {
             weight_h_data[j] *= -1;
           }
-          weight_h_data += 3 * this->OC;
+          weight_h_data += 3 * OC;
         }
       }

@@ -173,7 +338,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
           this->fwd_pd_->weights_iter_desc(), this->engine_);

       auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
-      dnnl::reorder(user_memory, *memory_p, this->attr_)
+      dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);

       this->dev_ctx_.SetBlob(wh_key, memory_p);

@@ -183,7 +348,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
   std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias,
                                                   const bool origin_mode) {
-    const std::string bias_key = this->memory_key_ + "@bias";
+    const std::string bias_key = memory_key_ + "@bias";
     auto memory_p = std::static_pointer_cast<dnnl::memory>(
         this->dev_ctx_.GetBlob(bias_key));

@@ -194,15 +359,15 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
       if (bias) {
         const float* user_bias_data =
             bias->data<float>();  // Bias in oneDNN is always float
-        memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
+        memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OC);
       } else {
         // oneDNN always need bias memory, if it's not provided in PP, let
         // oneDNN allocate memory and set it to 0
-        memset(bias_data, 0, sizeof(float) * this->G * this->OC);
+        memset(bias_data, 0, sizeof(float) * 3 * OC);
       }

       if (origin_mode == false && bias) {
-        for (int64_t i = 0; i < this->OC; ++i) {
+        for (int64_t i = 0; i < OC; ++i) {
           bias_data[i] *= -1;
         }
       }

@@ -210,6 +375,19 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
     }
     return memory_p;
   }

+ private:
+  // RNN dimensions
+  // N - Batch Size
+  // Ti - Max sentence length
+  // IC - Input Channels
+  // OC - Output Channels
+  const int64_t N, Ti, IC, OC;
+
+  // Memory size of weights, bias and h0 does not depend
+  // on Ti size, thus we need another key to cache them
+  std::string memory_key_;
+  dnnl::primitive_attr attr_;
 };

 template <typename T>
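The reorderRNNdata routine restored into the GRU handler above packs Paddle's LoD-encoded batch ([WORDS, C] plus per-sequence offsets) into the dense, zero-padded [N, T, C] buffer that oneDNN expects. The following is a self-contained sketch of the PP_NTC case only; it is not Paddle code, and the lod, N, Ti, IC values are chosen purely for illustration.

#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Toy batch: two sequences of lengths 2 and 3, feature size IC = 2.
  // LoD offsets mark where each sequence starts in the packed input.
  const std::vector<size_t> lod = {0, 2, 5};
  const int N = 2, Ti = 3, IC = 2;

  // Packed PaddlePaddle layout [WORDS, C]: 5 time steps, 2 channels each.
  std::vector<float> packed = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5};

  // Dense oneDNN layout [N, T, C], zero-padded up to the longest sequence.
  std::vector<float> dense(N * Ti * IC, 0.f);

  const bool is_reverse = false;
  const float* src = packed.data();
  for (int n = 0; n < N; ++n) {
    const size_t num_elements = (lod[n + 1] - lod[n]) * IC;
    // Reversed sequences are right-aligned so their last step lands at t = Ti-1.
    const size_t offset = is_reverse ? (Ti * IC - num_elements) : 0;
    std::memcpy(dense.data() + n * Ti * IC + offset, src,
                sizeof(float) * num_elements);
    src += num_elements;
  }

  for (float v : dense) std::cout << v << ' ';  // 1 1 2 2 0 0 3 3 4 4 5 5
  std::cout << '\n';
}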
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc — deleted (100644 → 0, view file @ 7fbc68a2)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"

namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;

template <typename T, typename T_out = T>
class LSTMMKLDNNHandler
    : public RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out> {
 public:
  LSTMMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                    const platform::MKLDNNDeviceContext& dev_ctx,
                    const mkldnn::engine mkldnn_engine,
                    platform::Place cpu_place, const LoDTensor* input,
                    const Tensor* weight_h, const Tensor* h0, const Tensor* c0,
                    const bool is_reverse, const int64_t N, const int64_t Ti,
                    const int64_t IC, const int64_t OC,
                    const std::string& unique_name)
      : RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out>(
            ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
            is_reverse, N, Ti, IC, OC, 4,
            ctx.InputName("X") + ctx.InputName("WeightH")) {
    if (!this->isCached()) {
      const bool is_INT8 = std::is_same<T, uint8_t>::value;
      const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
      // oneDNN kernel has hardcoded activation functions
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("gate_activation"), "sigmoid",
          platform::errors::Unimplemented("oneDNN fusion_lstm supports only "
                                          "sigmoid as a gate activation."));
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("cell_activation"), "tanh",
          platform::errors::Unimplemented(
              "oneDNN fusion_lstm supports only tanh as a cell activation."));
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("candidate_activation"), "tanh",
          platform::errors::Unimplemented(
              "oneDNN fusion_lstm supports only tanh a candidate activation."));

      // Weights for int8 kernel are of a type s8
      const auto weights_dt =
          is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType<T>();

      // oneDNN RNN dimensions
      const int64_t D = 1;  // Directions
      const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
      const int64_t G = 4;  // Number of Gates, 4 for LSTM

      // Create memory descriptors
      auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
                                    MKLDNNMemoryFormat::tnc);
      auto weight_x_md =
          MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
      auto weight_h_md =
          MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
      auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
                                   MKLDNNMemoryFormat::ldgo);
      auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
                                     MKLDNNMemoryFormat::tnc);
      auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
                                 MKLDNNMemoryFormat::ldnc);
      auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
                                 MKLDNNMemoryFormat::ldnc);

      // Create LSTM oneDNN primitive
      const auto direction =
          is_reverse ? dnnl::rnn_direction::unidirectional_right2left
                     : dnnl::rnn_direction::unidirectional_left2right;
      if (!use_peepholes) {
        this->AcquireForwardPrimitiveDescriptor(
            this->attr_, dnnl::prop_kind::forward_inference, direction,
            input_md, h0_md, c0_md, weight_x_md, weight_h_md, bias_md,
            hidden_md, dnnl::memory::desc(), dnnl::memory::desc());
      } else {
        auto weight_peephole_md =
            MKLDNNMemDesc({L, D, 3, OC}, MKLDNNGetDataType<float>(),
                          MKLDNNMemoryFormat::ldgo);
        this->AcquireForwardPrimitiveDescriptor(
            this->attr_, dnnl::prop_kind::forward_inference, direction,
            input_md, h0_md, c0_md, weight_x_md, weight_h_md,
            weight_peephole_md, bias_md, hidden_md, dnnl::memory::desc(),
            dnnl::memory::desc());
      }
    }
  }

  // PaddlePaddle has different order of weights than oneDNN, so a reorder is
  // needed
  // PaddlePaddle: {c, i, f, o}
  // oneDNN:       {i, f, c, o}
  void ReorderGates(float* weights, int64_t I) {
    size_t inner_block_size = this->OC;
    size_t block_size = inner_block_size * this->G;
    for (size_t i = 0; i < (size_t)I; ++i) {
      size_t offset = i * block_size;

      float* base_pos = weights + offset;
      std::swap_ranges(base_pos, base_pos + inner_block_size,
                       base_pos + inner_block_size);  // c <-> i
      std::swap_ranges(base_pos + inner_block_size,
                       base_pos + 2 * inner_block_size,
                       base_pos + 2 * inner_block_size);  // c <-> f
    }
  }

  std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x) {
    const std::string wx_key = this->memory_key_ + "@weight_x";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, this->engine_);

      auto* weight_x_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      memcpy(weight_x_data, weight_x->data<float>(),
             sizeof(float) * this->IC * this->G * this->OC);

      ReorderGates(weight_x_data, this->IC);

      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_layer_desc(), this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_memory, *memory_p, this->attr_)
          .execute(astream, user_memory, *memory_p);

      this->dev_ctx_.SetBlob(wx_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h) {
    const std::string wh_key = this->memory_key_ + "@weight_h";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, this->engine_);

      auto* weight_h_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      memcpy(weight_h_data, weight_h->data<float>(),
             sizeof(float) * this->OC * this->G * this->OC);

      ReorderGates(weight_h_data, this->OC);

      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_iter_desc(), this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_memory, *memory_p, this->attr_)
          .execute(astream, user_memory, *memory_p);

      this->dev_ctx_.SetBlob(wh_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias) {
    const std::string bias_key = this->memory_key_ + "@bias";
    auto memory_p = std::static_pointer_cast<dnnl::memory>(
        this->dev_ctx_.GetBlob(bias_key));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
                                                this->engine_);
      auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
      if (bias) {
        const float* user_bias_data =
            bias->data<float>();  // Bias in oneDNN is always float
        memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);

        ReorderGates(bias_data, 1);
      } else {
        // oneDNN always need bias memory, if it's not provided in PP, let
        // oneDNN allocate memory and set it to 0
        memset(bias_data, 0, sizeof(float) * this->G * this->OC);
      }

      this->dev_ctx_.SetBlob(bias_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquirePeepholeWeights(const Tensor* bias) {
    const std::string peepholes_key = this->memory_key_ + "@peepholes_weights";
    auto memory_p = std::static_pointer_cast<dnnl::memory>(
        this->dev_ctx_.GetBlob(peepholes_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, 3, this->OC}, MKLDNNGetDataType<float>(),
                        MKLDNNMemoryFormat::ldgo);
      auto user_memory = dnnl::memory(user_md, this->engine_);
      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_peephole_desc(), this->engine_);
      auto* peephole_weights_data =
          reinterpret_cast<float*>(memory_p->get_data_handle());

      const float* user_bias_data =
          bias->data<float>();  // Bias in oneDNN is always float
      memcpy(peephole_weights_data, user_bias_data + 4 * this->OC,
             sizeof(float) * 3 * this->OC);

      this->dev_ctx_.SetBlob(peepholes_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireC0Memory(const Tensor* c0) {
    const std::string c0_key = this->memory_key_ + "@c0";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(c0_key));

    if (!memory_p) {
      auto user_c0_memory = dnnl::memory();
      if (c0) {
        user_c0_memory =
            dnnl::memory({{1, 1, this->N, this->OC},
                          MKLDNNGetDataType<float>(),
                          MKLDNNMemoryFormat::ldnc},
                         this->engine_, to_void_cast(c0->data<float>()));
      } else {
        user_c0_memory = dnnl::memory({{1, 1, this->N, this->OC},
                                       MKLDNNGetDataType<float>(),
                                       MKLDNNMemoryFormat::ldnc},
                                      this->engine_);
        memset(user_c0_memory.get_data_handle(), 0,
               sizeof(float) * this->N * this->OC);
      }
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
                                                this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_c0_memory, *memory_p, this->attr_)
          .execute(astream, user_c0_memory, *memory_p);

      this->dev_ctx_.SetBlob(c0_key, memory_p);
    }
    return memory_p;
  }
};

template <typename T>
class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    RunKernel<float>(ctx);
  }

  template <typename Tout = T>
  void RunKernel(const framework::ExecutionContext& ctx) const {
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    // Get Tensors
    const auto* input = ctx.Input<LoDTensor>("X");
    const auto* h0 = ctx.Input<Tensor>("H0");
    const auto* c0 = ctx.Input<Tensor>("C0");
    const auto* weight_x = ctx.Input<Tensor>("WeightX");
    const auto* weight_h = ctx.Input<Tensor>("WeightH");
    const auto* bias = ctx.Input<Tensor>("Bias");
    auto* hidden = ctx.Output<LoDTensor>("Hidden");
    auto* cell = ctx.Output<LoDTensor>("Cell");
    cell = cell;
    auto x_dims = input->dims();
    auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
                          ? framework::flatten_to_2d(x_dims, 1)
                          : x_dims;
    // Get attributes
    const bool is_reverse = ctx.Attr<bool>("is_reverse");
    const bool use_peepholes = ctx.Attr<bool>("use_peepholes");

    // Get tensor dimensions
    const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
    const auto weight_h_dims = framework::vectorize(weight_h->dims());
    const auto& input_lod = input->lod()[0];

    // Calculate RNN dimensions
    const int64_t N = input_lod.size() - 1;  // Number of sentences (batches)
    const int64_t Ti =  // Max length of the sentence in a batch
        [&input_lod]() {
          size_t res = 0;
          for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
            res = std::max(res, input_lod[i + 1] - input_lod[i]);
          }
          return res;
        }();
    const int64_t IC = x_mat_dims_vec[1];  // Input channels
    const int64_t OC = weight_h_dims[0];   // Output channels

    LSTMMKLDNNHandler<T, Tout> handler(
        ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0, c0,
        is_reverse, N, Ti, IC, OC,
        ctx.InputName("X") + ctx.InputName("WeightH"));

    auto input_memory_p =
        handler.AcquireInputMemoryWithReorder(input, is_reverse);
    auto h0_memory_p = handler.AcquireH0Memory(h0);
    auto c0_memory_p = handler.AcquireC0Memory(c0);
    auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x);
    auto weight_h_memory_p = handler.AcquireWeightHMemory(weight_h);
    auto bias_memory_p = handler.AcquireBiasMemory(bias);
    auto hidden_onednn_memory_p = handler.AcquireOutputMemory();

    std::unordered_map<int, dnnl::memory> lstm_args = {
        {DNNL_ARG_SRC_LAYER, *input_memory_p},
        {DNNL_ARG_SRC_ITER, *h0_memory_p},
        {DNNL_ARG_SRC_ITER_C, *c0_memory_p},
        {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
        {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
        {DNNL_ARG_BIAS, *bias_memory_p},
        {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};

    if (use_peepholes) {
      auto peephole_weight_p = handler.AcquirePeepholeWeights(bias);
      std::pair<int, dnnl::memory> peepholes_weights(DNNL_ARG_WEIGHTS_PEEPHOLE,
                                                     *peephole_weight_p);
      lstm_args.insert(peepholes_weights);
    }

    auto lstm_forward_p = handler.AcquireForwardPrimitive();

    dnnl::stream astream(mkldnn_engine);
    lstm_forward_p->execute(astream, lstm_args);
    astream.wait();

    auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
    auto* hidden_data =
        to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
    if (handler.is_NTC()) {
      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
                             is_reverse, platform::RNNReorderType::NTC_PP);
    } else {
      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
                             is_reverse, platform::RNNReorderType::TNC_PP);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace,
                   ops::FusionLSTMMKLDNNKernel<float>);
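The ReorderGates helper in the deleted file above rotates each gate block so that PaddlePaddle's {c, i, f, o} weight order matches oneDNN's {i, f, c, o}. Below is a minimal standalone sketch of that rotation on a single row of four gate blocks; it is plain C++ rather than Paddle code, and OC = 2 and the values are chosen only for illustration.

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  const size_t OC = 2;  // output channels per gate block
  // One row of 4 gate blocks in PaddlePaddle order {c, i, f, o}.
  std::vector<float> w = {/*c*/ 1, 1, /*i*/ 2, 2, /*f*/ 3, 3, /*o*/ 4, 4};

  float* base = w.data();
  // c <-> i : swap block 0 with block 1  -> {i, c, f, o}
  std::swap_ranges(base, base + OC, base + OC);
  // c <-> f : swap block 1 with block 2  -> {i, f, c, o}
  std::swap_ranges(base + OC, base + 2 * OC, base + 2 * OC);

  for (float v : w) std::cout << v << ' ';  // 2 2 3 3 1 1 4 4
  std::cout << '\n';
}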
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h — deleted (100644 → 0, view file @ 7fbc68a2)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/mkldnn_reuse.h"

namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;

template <typename T, typename T_alg, typename T_out = T>
class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
 public:
  RNNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                   const platform::MKLDNNDeviceContext& dev_ctx,
                   const mkldnn::engine mkldnn_engine,
                   platform::Place cpu_place, const LoDTensor* input,
                   const Tensor* weight_h, const Tensor* h0,
                   const bool is_reverse, const int64_t N, const int64_t Ti,
                   const int64_t IC, const int64_t OC, const int64_t G,
                   const std::string& unique_name)
      : platform::MKLDNNHandlerT<T, T_alg>(
            dev_ctx, dev_ctx.GetEngine(), cpu_place,
            CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
        N(N),
        Ti(Ti),
        IC(IC),
        OC(OC),
        G(G) {
    // Create memory key without Ti because weights, bias and h0 memories
    // do not depend on Ti size but primitive and input/output memory do
    memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
        dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));

    // Is it int8 kernel
    const bool is_INT8 = std::is_same<T, uint8_t>::value;

    if (is_INT8) {
      // Int8 attributes
      const float scale_data = ctx.Attr<float>("Scale_data");
      const float shift_data = ctx.Attr<float>("Shift_data");
      const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");

      const int weights_scale_mask =
          0 + (1 << 3)  // bit, indicating the unique scales for `g` dim in `ldigo`
          + (1 << 4);   // bit, indicating the unique scales for `o` dim in `ldigo`

      attr_.set_rnn_data_qparams(scale_data, shift_data);
      attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
    }
  }

  bool is_NTC() {
    return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
            dnnl::memory::format_tag::ntc);
  }

  void reorderRNNdata(void* input_data, void* output_data,
                      std::vector<size_t> lod, const bool is_reverse,
                      platform::RNNReorderType reorder_type) {
    switch (reorder_type) {
      // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
      case platform::RNNReorderType::PP_NTC: {
        auto* input_data_iter = reinterpret_cast<T*>(input_data);
        auto* output_data_iter = reinterpret_cast<T*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * IC;
          const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
          memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
                 sizeof(T) * num_elements);
          input_data_iter += num_elements;
        }
      } break;
      // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
      case platform::RNNReorderType::PP_TNC: {
        auto* input_data_iter = reinterpret_cast<T*>(input_data);
        auto* output_data_iter = reinterpret_cast<T*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]);
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
                   input_data_iter, sizeof(T) * IC);
            input_data_iter += IC;
          }
        }
      } break;
      // Reorder output values to PP format [N, T, C] -> [WORDS, C]
      case platform::RNNReorderType::NTC_PP: {
        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * OC;
          const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
          memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
                 sizeof(T_out) * num_elements);
          output_data_iter += num_elements;
        }
      } break;
      // Reorder output values to PP format [T, N, C] -> [WORDS, C]
      case platform::RNNReorderType::TNC_PP: {
        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = lod[n + 1] - lod[n];
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data_iter,
                   input_data_iter + (t + offset) * N * OC + n * OC,
                   sizeof(T_out) * OC);
            output_data_iter += OC;
          }
        }
      } break;
    }
  }

  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
      const LoDTensor* input, const bool is_reverse) {
    const auto name = this->key_ + "@input_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }

    const auto& input_lod = input->lod()[0];
    auto* x_data = to_void_cast(input->data<T>());

    auto* x_onednn_data = memory_p->get_data_handle();
    memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);

    if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
        dnnl::memory::format_tag::ntc) {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_NTC);
    } else {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_TNC);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
    const auto name = this->key_ + "@output_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }
    return memory_p;
  }

  // TODO(grygielski) H0 is for now persistable
  // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
  // not support in yet)
  std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
    const std::string h0_key = memory_key_ + "@h0";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));

    if (!memory_p) {
      auto user_h0_memory = dnnl::memory();
      if (h0) {
        user_h0_memory =
            dnnl::memory({{1, 1, N, OC}, MKLDNNGetDataType<float>(),
                          MKLDNNMemoryFormat::ldnc},
                         this->engine_, to_void_cast(h0->data<float>()));
      } else {
        user_h0_memory = dnnl::memory({{1, 1, N, OC},
                                       MKLDNNGetDataType<float>(),
                                       MKLDNNMemoryFormat::ldnc},
                                      this->engine_);
        memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
      }
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
                                                this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_h0_memory, *memory_p, attr_)
          .execute(astream, user_h0_memory, *memory_p);

      this->dev_ctx_.SetBlob(h0_key, memory_p);
    }
    return memory_p;
  }

 protected:
  // RNN dimensions
  // N - Batch Size
  // Ti - Max sentence length
  // IC - Input Channels
  // OC - Output Channels
  // G - Number of gates
  const int64_t N, Ti, IC, OC, G;

  // Memory size of weights, bias and h0 does not depend
  // on Ti size, thus we need another key to cache them
  std::string memory_key_;
  dnnl::primitive_attr attr_;
};

}  // namespace operators
}  // namespace paddle
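The deleted handler keeps two cache keys: the base class key includes the max sequence length Ti (the primitive and its input/output memory depend on it), while memory_key_ omits Ti so reordered weights, bias, and h0 can be reused across batches with different maximum lengths. The following small sketch of that two-key scheme is illustrative only; the std::map stands in for the device context's blob cache and the names are hypothetical.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-in for the device-context blob cache.
std::map<std::string, std::shared_ptr<std::vector<float>>> blob_cache;

std::shared_ptr<std::vector<float>> AcquireWeights(const std::string& memory_key) {
  // The weights key carries no Ti, so a later batch with a different max
  // sequence length still hits the already reordered weights.
  const std::string wx_key = memory_key + "@weight_x";
  auto it = blob_cache.find(wx_key);
  if (it != blob_cache.end()) return it->second;
  auto weights = std::make_shared<std::vector<float>>(16, 1.f);  // reorder would happen here
  blob_cache[wx_key] = weights;
  return weights;
}

int main() {
  const std::string unique_name = "fusion_gru/X+WeightH";
  for (int Ti : {5, 9}) {  // two batches with different max sequence lengths
    const std::string primitive_key = unique_name + "@Ti" + std::to_string(Ti);  // per-primitive
    const std::string memory_key = unique_name;                                  // Ti-independent
    AcquireWeights(memory_key);
    std::cout << primitive_key << " -> cached weight blobs: " << blob_cache.size() << '\n';  // stays 1
  }
}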
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py (view file @ 824a79d3)

@@ -75,6 +75,4 @@ class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):

 if __name__ == "__main__":
-    from paddle import enable_static
-    enable_static()
     unittest.main()
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py — deleted (100644 → 0, view file @ 7fbc68a2)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_fusion_lstm_op import TestFusionLSTMOp


class TestFusionLSTMONEDNNOp(TestFusionLSTMOp):
    def set_conf(self):
        self.use_mkldnn = True

    def test_check_output(self):
        for use_seq in {True, False}:
            self.attrs['use_seq'] = use_seq
            self.check_output(check_dygraph=False, no_check_set=["Cell"])


class TestFusionLSTMONEDNNOpReverse(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.is_reverse = True
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpInitReverse(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.has_initial_state = True
        self.is_reverse = True
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD1(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 36
        self.D = 8
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD2(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 8
        self.D = 8
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD3(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 15
        self.D = 3
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpBS1(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.lod = [[3]]
        self.D = 16
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpPeepholesInit(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.use_peepholes = True
        self.has_initial_state = True
        self.use_mkldnn = True


if __name__ == '__main__':
    from paddle import enable_static
    enable_static()
    unittest.main()
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py (view file @ 824a79d3)

@@ -144,6 +144,4 @@ class TestFusionGRUOpBS1(TestFusionGRUOp):

 if __name__ == "__main__":
-    from paddle import enable_static
-    enable_static()
     unittest.main()
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py (view file @ 824a79d3)

@@ -58,7 +58,6 @@ class TestFusionLSTMOp(OpTest):
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
-        self.use_mkldnn = False
         self.set_conf()
         T = sum(self.lod[0])

@@ -111,8 +110,7 @@ class TestFusionLSTMOp(OpTest):
             'is_reverse': self.is_reverse,
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand,
-            'use_mkldnn': self.use_mkldnn
+            'candidate_activation': self.act_cand
         }

     def test_check_output(self):

@@ -193,6 +191,4 @@ class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):

 if __name__ == '__main__':
-    from paddle import enable_static
-    enable_static()
     unittest.main()
python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py (view file @ 824a79d3)

@@ -29,5 +29,4 @@ no_check_set_white_list = [
     'update_loss_scaling',
     'cudnn_lstm',
     'rnn',
-    'fusion_lstm',
 ]
tools/static_mode_white_list.py (view file @ 824a79d3)

@@ -601,7 +601,6 @@ STATIC_MODE_TESTING_LIST = [
     'test_bilinear_interp_mkldnn_op',
     'test_fusion_gru_int8_mkldnn_op',
     'test_fusion_gru_mkldnn_op',
-    'test_fusion_lstm_mkldnn_op',
     'test_gaussian_random_mkldnn_op',
     'test_lrn_mkldnn_op',
     'test_matmul_mkldnn_op',