Unverified commit d834f4e6, authored by jakpiase, committed by GitHub

Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30661)

* added external reorder to profiler

* resolved conflict

* added enable_static

* initial version of lstm, not working yet

* added lstm to operators.cmake

* added vanilla lstm mkldnn op

* added peephole weights integration

* minor changes

* added formatting

* added fusion_lstm_mkldnn to static_whitelist

* added formatting

* removed comment

* moved use_peepholes attribute inside is_cached block

* reverted wrong changes

* minor formatting change

* minor changes
Parent 1a13626f
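The kernel is selected through the new use_mkldnn attribute and is covered by the test_fusion_lstm_mkldnn_op tests added below. A minimal sketch of such a test, mirroring the test file in this commit (the class name here is illustrative):

# Sketch of how the oneDNN fusion_lstm kernel is exercised in the tests added
# by this commit: derive from the existing TestFusionLSTMOp harness, switch on
# use_mkldnn, and skip the Cell output, which the oneDNN kernel does not
# produce yet (hence the no_check_set whitelist entry below).
import unittest

from paddle.fluid.tests.unittests.test_fusion_lstm_op import TestFusionLSTMOp


class TestFusionLSTMONEDNNOpSketch(TestFusionLSTMOp):  # illustrative name
    def set_conf(self):
        self.use_mkldnn = True  # route fusion_lstm to the oneDNN kernel

    def test_check_output(self):
        for use_seq in {True, False}:
            self.attrs['use_seq'] = use_seq
            self.check_output(check_dygraph=False, no_check_set=["Cell"])


if __name__ == '__main__':
    from paddle import enable_static
    enable_static()
    unittest.main()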
......@@ -197,7 +197,7 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
"fused_bn_add_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
......
......@@ -14,11 +14,15 @@ register_operators(EXCLUDES
fused_embedding_eltwise_layernorm_op
fusion_group_op
fusion_gru_op
fusion_lstm_op
fused_bn_add_activation_op)
# fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")
op_library(fusion_lstm_op)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\nUSE_CPU_ONLY_OP(fusion_lstm);\n")
if (WITH_GPU)
# fused_bn_activation_op needs cudnn 7.4.1 above
......
......@@ -18,6 +18,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -145,8 +148,17 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
if (this->CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context());
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout,
library);
}
void FusionLSTMOpMaker::Make() {
......@@ -235,6 +247,9 @@ void FusionLSTMOpMaker::Make() {
"`tanh` by default.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Fusion Long-Short Term Memory (LSTM) Operator.
This operator fuses X into LSTM; for more details, refer to the LSTM op.
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
namespace paddle {
namespace operators {
......@@ -27,7 +27,7 @@ using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T, typename T_out = T>
class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
public:
GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
const platform::MKLDNNDeviceContext& dev_ctx,
......@@ -37,37 +37,12 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
const bool is_reverse, const int64_t N, const int64_t Ti,
const int64_t IC, const int64_t OC,
const std::string& unique_name)
: platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
N(N),
Ti(Ti),
IC(IC),
OC(OC) {
// Create memory key without Ti because weights, bias and h0 memories
// do not depend on Ti size but primitive and input/output memory do
memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));
// Is it int8 kernel
: RNNMKLDNNHandler<T, dnnl::gru_forward, T_out>(
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
is_reverse, N, Ti, IC, OC, 3,
ctx.InputName("X") + ctx.InputName("WeightH")) {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
}
if (!this->isCached()) {
// oneDNN kernel has hardcoded activation functions
PADDLE_ENFORCE_EQ(
......@@ -108,176 +83,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
: dnnl::rnn_direction::unidirectional_left2right;
this->AcquireForwardPrimitiveDescriptor(
attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
}
this->attr_, dnnl::prop_kind::forward_inference, direction, input_md,
h0_md, weight_x_md, weight_h_md, bias_md, hidden_md,
dnnl::memory::desc());
}
bool is_NTC() {
return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
dnnl::memory::format_tag::ntc);
}
void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) {
switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements);
input_data_iter += num_elements;
}
} break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC);
input_data_iter += IC;
}
}
} break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T_out) * num_elements);
output_data_iter += num_elements;
}
} break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter,
input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC;
}
}
} break;
}
}
std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
const LoDTensor* input, const bool is_reverse) {
const auto name = this->key_ + "@input_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
const auto& input_lod = input->lod()[0];
auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
dnnl::memory::format_tag::ntc) {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_NTC);
} else {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_TNC);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
const auto name = this->key_ + "@output_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
return memory_p;
}
// TODO(grygielski) H0 is for now persistable
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// not support in yet)
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
if (!memory_p) {
auto user_h0_memory = dnnl::memory();
if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
const bool origin_mode) {
const std::string wx_key = memory_key_ + "@weight_x";
const std::string wx_key = this->memory_key_ + "@weight_x";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldigo);
MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_x_data =
reinterpret_cast<float*>(user_memory.get_data_handle());
memcpy(weight_x_data, weight_x->data<float>(),
sizeof(float) * IC * 3 * OC);
sizeof(float) * this->IC * this->G * this->OC);
if (origin_mode == false) {
for (int64_t i = 0; i < IC; ++i) {
for (int64_t j = 0; j < OC; ++j) {
for (int64_t i = 0; i < this->IC; ++i) {
for (int64_t j = 0; j < this->OC; ++j) {
weight_x_data[j] *= -1;
}
weight_x_data += 3 * OC;
weight_x_data += 3 * this->OC;
}
}
......@@ -285,7 +119,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_layer_desc(), this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_memory, *memory_p, attr_)
dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p);
......@@ -295,14 +129,14 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
const bool origin_mode) {
const std::string wh_key = memory_key_ + "@weight_h";
const std::string wh_key = this->memory_key_ + "@weight_h";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldigo);
MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
......@@ -312,25 +146,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
auto* user_weight_h_data = weight_h->data<float>();
auto src1_iter = user_weight_h_data;
auto src2_iter = user_weight_h_data + 2 * OC * OC;
auto src2_iter = user_weight_h_data + 2 * this->OC * this->OC;
for (int64_t c = 0; c < OC; ++c) {
memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float));
memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float));
for (int64_t c = 0; c < this->OC; ++c) {
memcpy(weight_h_data, src1_iter, 2 * this->OC * sizeof(float));
memcpy(weight_h_data + 2 * this->OC, src2_iter,
this->OC * sizeof(float));
src1_iter += 2 * OC;
src2_iter += OC;
weight_h_data += 3 * OC;
src1_iter += 2 * this->OC;
src2_iter += this->OC;
weight_h_data += 3 * this->OC;
}
weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());
if (origin_mode == false) {
for (int64_t i = 0; i < OC; ++i) {
for (int64_t j = 0; j < OC; ++j) {
for (int64_t i = 0; i < this->OC; ++i) {
for (int64_t j = 0; j < this->OC; ++j) {
weight_h_data[j] *= -1;
}
weight_h_data += 3 * OC;
weight_h_data += 3 * this->OC;
}
}
......@@ -338,7 +173,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_iter_desc(), this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_memory, *memory_p, attr_)
dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p);
......@@ -348,7 +183,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias,
const bool origin_mode) {
const std::string bias_key = memory_key_ + "@bias";
const std::string bias_key = this->memory_key_ + "@bias";
auto memory_p = std::static_pointer_cast<dnnl::memory>(
this->dev_ctx_.GetBlob(bias_key));
......@@ -359,15 +194,15 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
if (bias) {
const float* user_bias_data =
bias->data<float>(); // Bias in oneDNN is always float
memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OC);
memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
} else {
// oneDNN always need bias memory, if it's not provided in PP, let
// oneDNN allocate memory and set it to 0
memset(bias_data, 0, sizeof(float) * 3 * OC);
memset(bias_data, 0, sizeof(float) * this->G * this->OC);
}
if (origin_mode == false && bias) {
for (int64_t i = 0; i < OC; ++i) {
for (int64_t i = 0; i < this->OC; ++i) {
bias_data[i] *= -1;
}
}
......@@ -375,19 +210,6 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
}
return memory_p;
}
private:
// RNN dimensions
// N - Batch Size
// Ti - Max sentence length
// IC - Input Channels
// OC - Output Channels
const int64_t N, Ti, IC, OC;
// Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them
std::string memory_key_;
dnnl::primitive_attr attr_;
};
template <typename T>
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
namespace paddle {
namespace operators {
using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T, typename T_out = T>
class LSTMMKLDNNHandler
: public RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out> {
public:
LSTMMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
const platform::MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const LoDTensor* input,
const Tensor* weight_h, const Tensor* h0, const Tensor* c0,
const bool is_reverse, const int64_t N, const int64_t Ti,
const int64_t IC, const int64_t OC,
const std::string& unique_name)
: RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out>(
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
is_reverse, N, Ti, IC, OC, 4,
ctx.InputName("X") + ctx.InputName("WeightH")) {
if (!this->isCached()) {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
// oneDNN kernel has hardcoded activation functions
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("gate_activation"), "sigmoid",
platform::errors::Unimplemented("oneDNN fusion_lstm supports only "
"sigmoid as a gate activation."));
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("cell_activation"), "tanh",
platform::errors::Unimplemented(
"oneDNN fusion_lstm supports only tanh as a cell activation."));
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("candidate_activation"), "tanh",
platform::errors::Unimplemented(
"oneDNN fusion_lstm supports only tanh a candidate activation."));
// Weights for int8 kernel are of a type s8
const auto weights_dt =
is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType<T>();
// oneDNN RNN dimensions
const int64_t D = 1; // Directions
const int64_t L = 1; // Layers (PP supports only 1 stacked layer)
const int64_t G = 4; // Number of Gates, 4 for LSTM
// Create memory descriptors
auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::tnc);
auto weight_x_md =
MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto weight_h_md =
MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
MKLDNNMemoryFormat::tnc);
auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
// Create LSTM oneDNN primitive
const auto direction =
is_reverse ? dnnl::rnn_direction::unidirectional_right2left
: dnnl::rnn_direction::unidirectional_left2right;
if (!use_peepholes) {
this->AcquireForwardPrimitiveDescriptor(
this->attr_, dnnl::prop_kind::forward_inference, direction,
input_md, h0_md, c0_md, weight_x_md, weight_h_md, bias_md,
hidden_md, dnnl::memory::desc(), dnnl::memory::desc());
} else {
auto weight_peephole_md =
MKLDNNMemDesc({L, D, 3, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
this->AcquireForwardPrimitiveDescriptor(
this->attr_, dnnl::prop_kind::forward_inference, direction,
input_md, h0_md, c0_md, weight_x_md, weight_h_md,
weight_peephole_md, bias_md, hidden_md, dnnl::memory::desc(),
dnnl::memory::desc());
}
}
}
// PaddlePaddle stores the gate weights in a different order than oneDNN, so a
// reorder is needed
// PaddlePaddle: {c, i, f, o}
// oneDNN: {i, f, c, o}
void ReorderGates(float* weights, int64_t I) {
size_t inner_block_size = this->OC;
size_t block_size = inner_block_size * this->G;
for (size_t i = 0; i < (size_t)I; ++i) {
size_t offset = i * block_size;
float* base_pos = weights + offset;
std::swap_ranges(base_pos, base_pos + inner_block_size,
base_pos + inner_block_size); // c <-> i
std::swap_ranges(base_pos + inner_block_size,
base_pos + 2 * inner_block_size,
base_pos + 2 * inner_block_size); // c <-> f
}
}
std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x) {
const std::string wx_key = this->memory_key_ + "@weight_x";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_x_data =
reinterpret_cast<float*>(user_memory.get_data_handle());
memcpy(weight_x_data, weight_x->data<float>(),
sizeof(float) * this->IC * this->G * this->OC);
ReorderGates(weight_x_data, this->IC);
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_layer_desc(), this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h) {
const std::string wh_key = this->memory_key_ + "@weight_h";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_h_data =
reinterpret_cast<float*>(user_memory.get_data_handle());
memcpy(weight_h_data, weight_h->data<float>(),
sizeof(float) * this->OC * this->G * this->OC);
ReorderGates(weight_h_data, this->OC);
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_iter_desc(), this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias) {
const std::string bias_key = this->memory_key_ + "@bias";
auto memory_p = std::static_pointer_cast<dnnl::memory>(
this->dev_ctx_.GetBlob(bias_key));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
this->engine_);
auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
if (bias) {
const float* user_bias_data =
bias->data<float>(); // Bias in oneDNN is always float
memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
ReorderGates(bias_data, 1);
} else {
// oneDNN always needs bias memory; if it's not provided in PP, let
// oneDNN allocate memory and set it to 0
memset(bias_data, 0, sizeof(float) * this->G * this->OC);
}
this->dev_ctx_.SetBlob(bias_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquirePeepholeWeights(const Tensor* bias) {
const std::string peepholes_key = this->memory_key_ + "@peepholes_weights";
auto memory_p = std::static_pointer_cast<dnnl::memory>(
this->dev_ctx_.GetBlob(peepholes_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, 3, this->OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
auto user_memory = dnnl::memory(user_md, this->engine_);
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_peephole_desc(), this->engine_);
auto* peephole_weights_data =
reinterpret_cast<float*>(memory_p->get_data_handle());
const float* user_bias_data =
bias->data<float>(); // Bias in oneDNN is always float
memcpy(peephole_weights_data, user_bias_data + 4 * this->OC,
sizeof(float) * 3 * this->OC);
this->dev_ctx_.SetBlob(peepholes_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireC0Memory(const Tensor* c0) {
const std::string c0_key = this->memory_key_ + "@c0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(c0_key));
if (!memory_p) {
auto user_c0_memory = dnnl::memory();
if (c0) {
user_c0_memory =
dnnl::memory({{1, 1, this->N, this->OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(c0->data<float>()));
} else {
user_c0_memory = dnnl::memory({{1, 1, this->N, this->OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_c0_memory.get_data_handle(), 0,
sizeof(float) * this->N * this->OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_c0_memory, *memory_p, this->attr_)
.execute(astream, user_c0_memory, *memory_p);
this->dev_ctx_.SetBlob(c0_key, memory_p);
}
return memory_p;
}
};
template <typename T>
class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
RunKernel<float>(ctx);
}
template <typename Tout = T>
void RunKernel(const framework::ExecutionContext& ctx) const {
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
// Get Tensors
const auto* input = ctx.Input<LoDTensor>("X");
const auto* h0 = ctx.Input<Tensor>("H0");
const auto* c0 = ctx.Input<Tensor>("C0");
const auto* weight_x = ctx.Input<Tensor>("WeightX");
const auto* weight_h = ctx.Input<Tensor>("WeightH");
const auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden = ctx.Output<LoDTensor>("Hidden");
auto* cell = ctx.Output<LoDTensor>("Cell");
cell = cell;  // Cell output is not computed by this kernel yet; the self-assignment avoids an unused-variable warning
auto x_dims = input->dims();
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
// Get attributes
const bool is_reverse = ctx.Attr<bool>("is_reverse");
const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
// Get tensor dimensions
const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
const auto weight_h_dims = framework::vectorize(weight_h->dims());
const auto& input_lod = input->lod()[0];
// Calculate RNN dimensions
const int64_t N = input_lod.size() - 1; // Number of sentences (batches)
const int64_t Ti = // Max length of the sentence in a batch
[&input_lod]() {
size_t res = 0;
for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
res = std::max(res, input_lod[i + 1] - input_lod[i]);
}
return res;
}();
const int64_t IC = x_mat_dims_vec[1]; // Input channels
const int64_t OC = weight_h_dims[0]; // Output channels
LSTMMKLDNNHandler<T, Tout> handler(
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0, c0,
is_reverse, N, Ti, IC, OC,
ctx.InputName("X") + ctx.InputName("WeightH"));
auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto c0_memory_p = handler.AcquireC0Memory(c0);
auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x);
auto weight_h_memory_p = handler.AcquireWeightHMemory(weight_h);
auto bias_memory_p = handler.AcquireBiasMemory(bias);
auto hidden_onednn_memory_p = handler.AcquireOutputMemory();
std::unordered_map<int, dnnl::memory> lstm_args = {
{DNNL_ARG_SRC_LAYER, *input_memory_p},
{DNNL_ARG_SRC_ITER, *h0_memory_p},
{DNNL_ARG_SRC_ITER_C, *c0_memory_p},
{DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
{DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
{DNNL_ARG_BIAS, *bias_memory_p},
{DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
if (use_peepholes) {
auto peephole_weight_p = handler.AcquirePeepholeWeights(bias);
std::pair<int, dnnl::memory> peepholes_weights(DNNL_ARG_WEIGHTS_PEEPHOLE,
*peephole_weight_p);
lstm_args.insert(peepholes_weights);
}
auto lstm_forward_p = handler.AcquireForwardPrimitive();
dnnl::stream astream(mkldnn_engine);
lstm_forward_p->execute(astream, lstm_args);
astream.wait();
auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
auto* hidden_data =
to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
if (handler.is_NTC()) {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::NTC_PP);
} else {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::TNC_PP);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace,
ops::FusionLSTMMKLDNNKernel<float>);
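For reference, a standalone sketch (not part of the diff) of the gate reorder performed by ReorderGates() above: PaddlePaddle stores gate weights as {c, i, f, o} while oneDNN expects {i, f, c, o}. OC = 2 and the marker values are assumptions made only for the illustration.

# Illustration of the {c, i, f, o} -> {i, f, c, o} gate reorder on one row.
import numpy as np

OC = 2  # assumed output-channel width for the illustration
row = np.array([1, 1, 2, 2, 3, 3, 4, 4], dtype=np.float32)  # c|i|f|o blocks of width OC

c, i, f, o = (row[k * OC:(k + 1) * OC].copy() for k in range(4))
reordered = np.concatenate([i, f, c, o])
print(reordered)  # [2. 2. 3. 3. 1. 1. 4. 4.] -> oneDNN order {i, f, c, o}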
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T, typename T_alg, typename T_out = T>
class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
public:
RNNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
const platform::MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const LoDTensor* input,
const Tensor* weight_h, const Tensor* h0,
const bool is_reverse, const int64_t N, const int64_t Ti,
const int64_t IC, const int64_t OC, const int64_t G,
const std::string& unique_name)
: platform::MKLDNNHandlerT<T, T_alg>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
N(N),
Ti(Ti),
IC(IC),
OC(OC),
G(G) {
// Create memory key without Ti because weights, bias and h0 memories
// do not depend on Ti size but primitive and input/output memory do
memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));
// Is it int8 kernel
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
}
}
bool is_NTC() {
return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
dnnl::memory::format_tag::ntc);
}
void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) {
switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements);
input_data_iter += num_elements;
}
} break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC);
input_data_iter += IC;
}
}
} break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T_out) * num_elements);
output_data_iter += num_elements;
}
} break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter,
input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC;
}
}
} break;
}
}
std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
const LoDTensor* input, const bool is_reverse) {
const auto name = this->key_ + "@input_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
const auto& input_lod = input->lod()[0];
auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
dnnl::memory::format_tag::ntc) {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_NTC);
} else {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_TNC);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
const auto name = this->key_ + "@output_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
return memory_p;
}
// TODO(grygielski) H0 is for now persistable
// TODO(jczaja) H0 should be updated each iter and be of T type (Fusion pass does
// not support it yet)
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
if (!memory_p) {
auto user_h0_memory = dnnl::memory();
if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p);
}
return memory_p;
}
protected:
// RNN dimensions
// N - Batch Size
// Ti - Max sentence length
// IC - Input Channels
// OC - Output Channels
// G - Number of gates
const int64_t N, Ti, IC, OC, G;
// Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them
std::string memory_key_;
dnnl::primitive_attr attr_;
};
} // namespace operators
} // namespace paddle
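Likewise, a small standalone sketch (not part of the diff) of the PP_NTC path in reorderRNNdata() above: LoD-packed input of shape [WORDS, C] is scattered into a zero-padded [N, Ti, C] batch, with reversed sequences right-aligned. The lod and IC values are made up for the example.

# Illustration of the PP_NTC reorder: [WORDS, C] + LoD -> zero-padded [N, T, C].
import numpy as np

IC = 2                                # input channels (assumed)
lod = [0, 2, 5]                       # two sentences, lengths 2 and 3
words = np.arange(5 * IC, dtype=np.float32).reshape(5, IC)   # packed [WORDS, C]

N = len(lod) - 1                      # batch size
Ti = max(lod[n + 1] - lod[n] for n in range(N))              # max sentence length
is_reverse = False

batch = np.zeros((N, Ti, IC), dtype=np.float32)
for n in range(N):
    seq = words[lod[n]:lod[n + 1]]
    offset = Ti - len(seq) if is_reverse else 0              # right-align if reversed
    batch[n, offset:offset + len(seq)] = seq
print(batch.shape)                    # (2, 3, 2); padded steps stay zero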
......@@ -75,4 +75,6 @@ class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):
if __name__ == "__main__":
from paddle import enable_static
enable_static()
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_fusion_lstm_op import TestFusionLSTMOp
class TestFusionLSTMONEDNNOp(TestFusionLSTMOp):
def set_conf(self):
self.use_mkldnn = True
def test_check_output(self):
for use_seq in {True, False}:
self.attrs['use_seq'] = use_seq
self.check_output(check_dygraph=False, no_check_set=["Cell"])
class TestFusionLSTMONEDNNOpReverse(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.is_reverse = True
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpInitReverse(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.has_initial_state = True
self.is_reverse = True
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpMD1(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.M = 36
self.D = 8
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpMD2(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.M = 8
self.D = 8
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpMD3(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.M = 15
self.D = 3
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpBS1(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.lod = [[3]]
self.D = 16
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpPeepholesInit(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.use_peepholes = True
self.has_initial_state = True
self.use_mkldnn = True
if __name__ == '__main__':
from paddle import enable_static
enable_static()
unittest.main()
......@@ -144,4 +144,6 @@ class TestFusionGRUOpBS1(TestFusionGRUOp):
if __name__ == "__main__":
from paddle import enable_static
enable_static()
unittest.main()
......@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest):
self.act_gate = 'sigmoid'
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.use_mkldnn = False
self.set_conf()
T = sum(self.lod[0])
......@@ -110,7 +111,8 @@ class TestFusionLSTMOp(OpTest):
'is_reverse': self.is_reverse,
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
'candidate_activation': self.act_cand
'candidate_activation': self.act_cand,
'use_mkldnn': self.use_mkldnn
}
def test_check_output(self):
......@@ -191,4 +193,6 @@ class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
if __name__ == '__main__':
from paddle import enable_static
enable_static()
unittest.main()
......@@ -29,4 +29,5 @@ no_check_set_white_list = [
'update_loss_scaling',
'cudnn_lstm',
'rnn',
'fusion_lstm',
]
......@@ -601,6 +601,7 @@ STATIC_MODE_TESTING_LIST = [
'test_bilinear_interp_mkldnn_op',
'test_fusion_gru_int8_mkldnn_op',
'test_fusion_gru_mkldnn_op',
'test_fusion_lstm_mkldnn_op',
'test_gaussian_random_mkldnn_op',
'test_lrn_mkldnn_op',
'test_matmul_mkldnn_op',
......