Commit 8e182170, authored by tensor-tang

refine and replace lstm peephole kernel

Parent 7ef2699e
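
This change folds the hand-written LSTM peephole macros in fusion_lstm_op.cc into the jitkernel LSTMKernel: the kernel interface gains the diagonal peephole weights (wp_data) and a scratch buffer (checked), so the peephole and non-peephole paths share one call pattern. A minimal sketch of the resulting call sites, assembled from the hunks below (the variable names are the ones used in the diff):

    // One fused-LSTM kernel per hidden size D; use_peepholes picks the
    // peephole or plain implementation inside the kernel pool.
    const auto& ker =
        math::jitkernel::KernelPool::Instance()
            .template Get<math::jitkernel::LSTMKernel<T>, const std::string&,
                          const std::string&, const std::string&>(
                ctx.Attr<std::string>("gate_activation"),
                ctx.Attr<std::string>("candidate_activation"),
                ctx.Attr<std::string>("cell_activation"), D, use_peepholes);
    // First step of a sequence that has no h0/c0 ...
    ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data);
    // ... and every following step.
    ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data,
                     checked_cell_data);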
@@ -15,11 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
 #include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/platform/cpu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -219,116 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
 template <typename T>
 class FuisonLSTMKernel : public framework::OpKernel<T> {
  public:
-#define INIT_VEC_FUNC                                                          \
-  std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
-  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
-  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
-  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
-  if (platform::jit::MayIUse(platform::jit::avx)) {                            \
-    math::VecActivations<T, platform::jit::avx> act_functor;                   \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
-  } else {                                                                     \
-    math::VecActivations<T, platform::jit::isa_any> act_functor;               \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
-  }
-
-#define INIT_BASE_INPUT_OUTPUT                        \
-  auto* x = ctx.Input<LoDTensor>("X");                \
-  auto* h0 = ctx.Input<Tensor>("H0");                 \
-  auto* c0 = ctx.Input<Tensor>("C0");                 \
-  auto* wx = ctx.Input<Tensor>("WeightX");            \
-  auto* wh = ctx.Input<Tensor>("WeightH");            \
-  auto* bias = ctx.Input<Tensor>("Bias");             \
-  auto* xx = ctx.Output<LoDTensor>("XX");             \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
-  auto* cell_out = ctx.Output<LoDTensor>("Cell");     \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
-  bool use_peepholes = ctx.Attr<bool>("use_peepholes");
-
-#define INIT_BASE_SIZES                  \
-  auto x_dims = x->dims();   /* T x M*/  \
-  auto wh_dims = wh->dims(); /* D x 4D*/ \
-  const int M = x_dims[1];               \
-  const int D = wh_dims[0];              \
-  const int D2 = D * 2;                  \
-  const int D3 = D * 3;                  \
-  const int D4 = wh_dims[1];
-
-#define INIT_BASE_INPUT_DATAS                                 \
-  const T* x_data = x->data<T>();                             \
-  const T* wx_data = wx->data<T>();                           \
-  const T* wh_data = wh->data<T>();                           \
-  /* diagonal weight*/                                        \
-  const T* wc_data = bias->data<T>() + D4;                    \
-  /* for peephole only*/                                      \
-  T* checked_cell_data = nullptr;                             \
-  auto place = ctx.GetPlace();                                \
-  if (use_peepholes) {                                        \
-    /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/           \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");   \
-    checked_cell_data = checked_cell->mutable_data<T>(place); \
-  }
+#define INIT_BASE_DEFINES                                   \
+  using DeviceContext = paddle::platform::CPUDeviceContext; \
+  auto* x = ctx.Input<LoDTensor>("X");                      \
+  auto* h0 = ctx.Input<Tensor>("H0");                       \
+  auto* c0 = ctx.Input<Tensor>("C0");                       \
+  auto* wx = ctx.Input<Tensor>("WeightX");                  \
+  auto* wh = ctx.Input<Tensor>("WeightH");                  \
+  auto* bias = ctx.Input<Tensor>("Bias");                   \
+  auto* xx = ctx.Output<LoDTensor>("XX");                   \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");       \
+  auto* cell_out = ctx.Output<LoDTensor>("Cell");           \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");           \
+  bool use_peepholes = ctx.Attr<bool>("use_peepholes");     \
+  auto x_dims = x->dims();   /* T x M*/                     \
+  auto wh_dims = wh->dims(); /* D x 4D*/                    \
+  const int M = x_dims[1];                                  \
+  const int D = wh_dims[0];                                 \
+  const int D4 = wh_dims[1]
+
+#define INIT_OTHER_DEFINES                                                   \
+  const T* x_data = x->data<T>();                                            \
+  const T* wx_data = wx->data<T>();                                          \
+  const T* wh_data = wh->data<T>();                                          \
+  /* diagonal weight*/                                                       \
+  const T* wp_data = bias->data<T>() + D4;                                   \
+  /* for peephole only*/                                                     \
+  T* checked_cell_data = nullptr;                                            \
+  auto place = ctx.GetPlace();                                               \
+  if (use_peepholes) {                                                       \
+    /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/                          \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                  \
+    checked_cell_data = checked_cell->mutable_data<T>(place);                \
+  }                                                                          \
+  const auto& ker =                                                          \
+      math::jitkernel::KernelPool::Instance()                                \
+          .template Get<math::jitkernel::LSTMKernel<T>, const std::string&,  \
+                        const std::string&, const std::string&>(             \
+              ctx.Attr<std::string>("gate_activation"),                      \
+              ctx.Attr<std::string>("candidate_activation"),                 \
+              ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
 
-/// Compute LSTM
+// Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                          \
   blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
             wh_data, D4, static_cast<T>(1), out, D4)
 
-#define GET_Ct(ct_1, gates, ct)                   \
-  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
-  act_cand(D, gates, gates);                      \
-  blas.VMUL(D, gates, gates + D, gates + D);      \
-  blas.VMUL(D, ct_1, gates + D2, gates + D2);     \
-  blas.VADD(D, gates + D, gates + D2, ct)
-
-#define GET_Ht(ct, gates, ht)        \
-  /* H_t = act_cell(C_t) * ogated */ \
-  act_cell(D, ct, gates + D2);       \
-  blas.VMUL(D, gates + D2, gates + D3, ht)
-
-#define GET_Ct_NOH0C0(gates, ct)     \
-  /* C_t = igated * cgated*/         \
-  act_gate(D, gates + D, gates + D); \
-  act_cand(D, gates, gates);         \
-  blas.VMUL(D, gates, gates + D, ct)
-
-#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
-  GET_Ct_NOH0C0(gates, ct);                \
-  act_gate(D, gates + D3, gates + D3);     \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
-  GET_Ct_NOH0C0(gates, ct);                         \
-  /* get outgated, put W_oc * C_t on igated */      \
-  blas.VMUL(D, wc_data + D2, ct, gates + D);        \
-  blas.VADD(D, gates + D, gates + D3, gates + D3);  \
-  act_gate(D, gates + D3, gates + D3);              \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht)        \
-  /* get fgated and igated*/                              \
-  blas.VMUL(D, wc_data, ct_1, checked_cell_data);         \
-  blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
-  blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
-  act_gate(D2, gates + D, gates + D);                     \
-  GET_Ct(ct_1, gates, ct);                                \
-  /* get ogated*/                                         \
-  blas.VMUL(D, wc_data + D2, ct, gates + D);              \
-  blas.VADD(D, gates + D, gates + D3, gates + D3);        \
-  act_gate(D, gates + D3, gates + D3);                    \
-  GET_Ht(ct, gates, ht)
-
   void SeqCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
-    INIT_VEC_FUNC
-    INIT_BASE_INPUT_DATAS
+    INIT_BASE_DEFINES;
+    INIT_OTHER_DEFINES;
 
     auto x_lod = x->lod();
     const int total_T = x_dims[0];
     const int N = x_lod[0].size() - 1;
@@ -352,84 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       gate_offset = -D;
     }
 
-#define MOVE_ONE_STEP                    \
-  prev_h_data = h_out_data;              \
-  prev_c_data = c_out_data;              \
-  xx_data = xx_data + xx_offset;         \
-  h_out_data = h_out_data + gate_offset; \
-  c_out_data = c_out_data + gate_offset
-
-#define PROCESS_H0C0_DEFINES                       \
-  int bid = is_reverse ? N - 1 - i : i;            \
-  int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
-  const T* prev_c_data = nullptr;                  \
-  const T* prev_h_data = nullptr;                  \
-  int tstart = 0
-
-#define PROCESS_H0C0_PEEPHOLE                                      \
-  PROCESS_H0C0_DEFINES;                                            \
-  if (h0_data) {                                                   \
-    prev_h_data = h0_data + bid * D;                               \
-    prev_c_data = c0_data + bid * D;                               \
-  } else {                                                         \
-    COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
-    MOVE_ONE_STEP;                                                 \
-    tstart = 1;                                                    \
-  }
-
-#define PROCESS_H0C0                                      \
-  PROCESS_H0C0_DEFINES;                                   \
-  if (h0_data) {                                          \
-    prev_h_data = h0_data + bid * D;                      \
-    prev_c_data = c0_data + bid * D;                      \
-  } else {                                                \
-    COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
-    MOVE_ONE_STEP;                                        \
-    tstart = 1;                                           \
-  }
-
-    if (use_peepholes) {
-      for (int i = 0; i < N; ++i) {
-        PROCESS_H0C0_PEEPHOLE
-        for (int step = tstart; step < seq_len; ++step) {
-          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
-          MOVE_ONE_STEP;
-        }
-      }
-    } else {
-      const auto& ker =
-          math::jitkernel::KernelPool::Instance()
-              .template Get<math::jitkernel::LSTMKernel<T>, const std::string&,
-                            const std::string&, const std::string&>(
-                  act_gate_str, act_cand_str, act_cell_str, D, false);
-      for (int i = 0; i < N; ++i) {
-        PROCESS_H0C0
-        for (int step = tstart; step < seq_len; ++step) {
-          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data);
-          MOVE_ONE_STEP;
-        }
-      }
-    }
-#undef PROCESS_H0C0_DEFINES
-#undef PROCESS_H0C0_PEEPHOLE
-#undef PROCESS_H0C0
-#undef MOVE_ONE_STEP
+    for (int i = 0; i < N; ++i) {
+      int bid = is_reverse ? N - 1 - i : i;
+      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
+      const T* prev_c_data = nullptr;
+      const T* prev_h_data = nullptr;
+      int tstart = 0;
+      if (h0_data) {
+        prev_h_data = h0_data + bid * D;
+        prev_c_data = c0_data + bid * D;
+      } else {
+        ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data);
+        tstart = 1;
+        // move one step
+        prev_h_data = h_out_data;
+        prev_c_data = c_out_data;
+        xx_data = xx_data + xx_offset;
+        h_out_data = h_out_data + gate_offset;
+        c_out_data = c_out_data + gate_offset;
+      }
+      for (int step = tstart; step < seq_len; ++step) {
+        GEMM_WH_ADDON(1, prev_h_data, xx_data);
+        ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data,
+                         checked_cell_data);
+        // move one step
+        prev_h_data = h_out_data;
+        prev_c_data = c_out_data;
+        xx_data = xx_data + xx_offset;
+        h_out_data = h_out_data + gate_offset;
+        c_out_data = c_out_data + gate_offset;
+      }
+    }
   }
 
   void BatchCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = platform::CPUDeviceContext;
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
+    INIT_BASE_DEFINES;
     if (x->lod()[0].size() == 2) {
       xx->Resize({x_dims[0], D4});
       SeqCompute(ctx);
       return;
     }
-    INIT_VEC_FUNC
-    INIT_BASE_INPUT_DATAS
+    INIT_OTHER_DEFINES;
 
     auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
     auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
@@ -477,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       prev_c_data = reordered_c0_data;
       size_t sz = sizeof(T) * D;
       for (int i = 0; i < max_bs; ++i) {
-        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
-        std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
+        blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data);
+        blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data);
         reordered_h0_data += D;
         reordered_c0_data += D;
       }
@@ -488,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       T* cur_h_out_data = batched_h_out_data;
       T* cur_c_out_data = batched_c_out_data;
       for (int i = 0; i < max_bs; ++i) {
-        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
-        if (use_peepholes) {
-          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
-          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
-        }
-        act_gate(D, cur_in_data + D3, cur_in_data + D3);
-        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
+        ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data);
         cur_in_data += D4;
         cur_c_out_data += D;
         cur_h_out_data += D;
@@ -503,66 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       prev_h_data = batched_h_out_data;
       prev_c_data = batched_c_out_data;
     }
+    // compute kernel part
     const auto& batch_starts = batched_lod[0];
     const int max_seq_len = batch_starts.size() - 1;
     const int offset = tstart * max_bs * D;
     batched_input_data = batched_input_data + offset * 4;
     batched_h_out_data = batched_h_out_data + offset;
     batched_c_out_data = batched_c_out_data + offset;
-
-#define DEFINE_CUR                        \
-  T* cur_in_data = batched_input_data;    \
-  T* cur_prev_c_data = prev_c_data;       \
-  T* cur_c_out_data = batched_c_out_data; \
-  T* cur_h_out_data = batched_h_out_data
-
-#define MOVE_ONE_BATCH  \
-  cur_in_data += D4;    \
-  cur_prev_c_data += D; \
-  cur_c_out_data += D;  \
-  cur_h_out_data += D
-
-#define MOVE_ONE_STEP                  \
-  prev_c_data = batched_c_out_data;    \
-  prev_h_data = batched_h_out_data;    \
-  batched_c_out_data = cur_c_out_data; \
-  batched_h_out_data = cur_h_out_data; \
-  batched_input_data = cur_in_data
-
-    if (use_peepholes) {
-      for (int step = tstart; step < max_seq_len; ++step) {
-        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-        DEFINE_CUR;
-        for (int i = 0; i < cur_bs; ++i) {
-          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                                cur_h_out_data);
-          MOVE_ONE_BATCH;
-        }
-        MOVE_ONE_STEP;
-      }
-    } else {
-      const auto& ker =
-          math::jitkernel::KernelPool::Instance()
-              .template Get<math::jitkernel::LSTMKernel<T>, const std::string&,
-                            const std::string&, const std::string&>(
-                  act_gate_str, act_cand_str, act_cell_str, D, false);
-      for (int step = tstart; step < max_seq_len; ++step) {
-        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-        DEFINE_CUR;
-        for (int i = 0; i < cur_bs; ++i) {
-          ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                           cur_h_out_data);
-          MOVE_ONE_BATCH;
-        }
-        MOVE_ONE_STEP;
-      }
-    }
-#undef MOVE_ONE_STEP
-#undef MOVE_ONE_BATCH
-#undef DEFINE_CUR
+    for (int step = tstart; step < max_seq_len; ++step) {
+      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+      GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+      T* cur_in_data = batched_input_data;
+      T* cur_prev_c_data = prev_c_data;
+      T* cur_c_out_data = batched_c_out_data;
+      T* cur_h_out_data = batched_h_out_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                         cur_h_out_data, wp_data, checked_cell_data);
+        // move one batch
+        cur_in_data += D4;
+        cur_prev_c_data += D;
+        cur_c_out_data += D;
+        cur_h_out_data += D;
+      }
+      // move one step
+      prev_c_data = batched_c_out_data;
+      prev_h_data = batched_h_out_data;
+      batched_c_out_data = cur_c_out_data;
+      batched_h_out_data = cur_h_out_data;
+      batched_input_data = cur_in_data;
+    }
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batched_h_out->set_lod(batched_lod);
@@ -579,17 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
     }
   }
 
-#undef COMPUTE_CtHt_PEEPHOLE
-#undef GET_Ct_NOH0C0
-#undef COMPUTE_CtHt_NOH0C0
-#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
-#undef GET_Ht
-#undef GET_Ct
 #undef GEMM_WH_ADDON
-#undef INIT_BASE_INPUT_DATAS
-#undef INIT_BASE_SIZES
-#undef INIT_BASE_INPUT_OUTPUT
-#undef INIT_VEC_FUNC
+#undef INIT_OTHER_DEFINES
+#undef INIT_BASE_DEFINES
 };
 
 }  // namespace operators
......
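
For reference, the per-element semantics that the fused kernels implement in the peephole case can be written as a scalar sketch (illustrative only; act_gate, act_cand and act_cell stand for the configured activation functions, and this loop is not code from the patch):

    // gates is one row laid out as [cand | igate | fgate | ogate], each of
    // width D; wp holds the diagonal peephole weights [w_ic | w_fc | w_oc].
    for (int k = 0; k < D; ++k) {
      T ig = act_gate(gates[D + k] + wp[k] * ct_1[k]);           // input gate peeks at C_{t-1}
      T fg = act_gate(gates[2 * D + k] + wp[D + k] * ct_1[k]);   // forget gate peeks at C_{t-1}
      ct[k] = fg * ct_1[k] + ig * act_cand(gates[k]);
      T og = act_gate(gates[3 * D + k] + wp[2 * D + k] * ct[k]); // output gate peeks at C_t
      ht[k] = og * act_cell(ct[k]);
    }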
@@ -126,7 +126,14 @@ template <typename T>
 class LSTMKernel : public Kernel {
  public:
   virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht,
+                           /* below only used in peephole*/
+                           const T *wp_data = nullptr,
                            T *checked = nullptr) const = 0;
+
+  // compute c1 and h1 without c0 or h0
+  virtual void ComputeC1H1(T *gates, T *ct, T *ht,
+                           /* below only used in peephole*/
+                           const T *wp_data = nullptr) const = 0;
 };
 
 }  // namespace jitkernel
......
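
Since wp_data and checked default to nullptr, non-peephole call sites keep their old shape; illustratively (the pointer names are placeholders, not code from this patch):

    ker->ComputeCtHt(gates, ct_1, ct, ht);                    // plain LSTM
    ker->ComputeCtHt(gates, ct_1, ct, ht, wp_data, checked);  // peephole LSTM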
@@ -82,6 +82,26 @@ __m256 AVXActImpl<kIdentity>::Compute(__m256 x) const {
 }
 #endif
 
+template <typename T>
+static std::shared_ptr<const VActKernel<T>> GetActKernel(
+    const std::string& type, int n) {
+  if (type == "sigmoid") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VSigmoidKernel<T>>(n));
+  } else if (type == "relu") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VReluKernel<T>>(n));
+  } else if (type == "tanh") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VTanhKernel<T>>(n));
+  } else if (type == "identity" || type == "") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VIdentityKernel<T>>(n));
+  }
+  PADDLE_THROW("Not support type: %s", type);
+  return nullptr;
+}
+
 /* LSTM JitKernel */
 template <typename T, jit::cpu_isa_t isa, jit_block>
 class LSTMKernelImpl : public LSTMKernel<T> {
@@ -93,26 +113,10 @@ class LSTMKernelImpl : public LSTMKernel<T> {
     d_ = d;
     d2_ = d * 2;
     d3_ = d * 3;
-    auto GetActKernel = [&](const std::string& type,
-                            int n) -> std::shared_ptr<const VActKernel<T>> {
-      if (type == "sigmoid") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VSigmoidKernel<T>>(n));
-      } else if (type == "relu") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VReluKernel<T>>(n));
-      } else if (type == "tanh") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VTanhKernel<T>>(n));
-      } else if (type == "identity" || type == "") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VIdentityKernel<T>>(n));
-      }
-      PADDLE_THROW("Not support type: %s", type);
-    };
-    act_gate_3d_ = GetActKernel(act_gate, d * 3);
-    act_cand_d_ = GetActKernel(act_cand, d);
-    act_cell_d_ = GetActKernel(act_cell, d);
+    act_gate_d3_ = GetActKernel<T>(act_gate, d3_);
+    act_gate_d_ = GetActKernel<T>(act_gate, d);
+    act_cand_d_ = GetActKernel<T>(act_cand, d);
+    act_cell_d_ = GetActKernel<T>(act_cell, d);
     vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
     vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
 #ifdef __AVX__
@@ -134,10 +138,10 @@ class LSTMKernelImpl : public LSTMKernel<T> {
 #endif
   }
 
-  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht,
+  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
                    T* checked) const override {
     // gates: W_ch, W_ih, W_fh, W_oh
-    act_gate_3d_->Compute(gates + d_, gates + d_);
+    act_gate_d3_->Compute(gates + d_, gates + d_);
 
     /* C_t = C_t-1 * fgated + cand_gated * igated */
     act_cand_d_->Compute(gates, gates);
@@ -149,10 +153,21 @@ class LSTMKernelImpl : public LSTMKernel<T> {
     act_cell_d_->Compute(ct, gates + d2_);
     vmul_d_->Compute(gates + d2_, gates + d3_, ht);
   }
+  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
+    /* C_t = igated * cgated*/
+    act_gate_d_->Compute(gates + d_, gates + d_);
+    act_cand_d_->Compute(gates, gates);
+    vmul_d_->Compute(gates, gates + d_, ct);
+    /* H_t = act_cell(C_t) * ogated */
+    act_gate_d_->Compute(gates + d3_, gates + d3_);
+    act_cell_d_->Compute(ct, gates + d2_);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+  }
 
  private:
   int d_, d2_, d3_;
-  std::shared_ptr<const VActKernel<T>> act_gate_3d_, act_cand_d_, act_cell_d_;
+  std::shared_ptr<const VActKernel<T>> act_gate_d3_, act_gate_d_, act_cand_d_,
+      act_cell_d_;
   std::shared_ptr<const VMulKernel<T>> vmul_d_;
   std::shared_ptr<const VAddKernel<T>> vadd_d_;
 #ifdef __AVX__
@@ -163,8 +178,8 @@ class LSTMKernelImpl : public LSTMKernel<T> {
 #define INTRI8_FLOAT(isa)                                                    \
   template <>                                                                \
   void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt(                        \
-      float* gates, const float* ct_1, float* ct, float* ht, float* checked) \
-      const {                                                                \
+      float* gates, const float* ct_1, float* ct, float* ht,                 \
+      const float* wp_data, float* checked) const {                          \
     /* gates: W_ch, W_ih, W_fh, W_oh */                                      \
     __m256 c, i, f, o;                                                       \
     c = _mm256_loadu_ps(gates);                                              \
@@ -205,51 +220,56 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
     d_ = d;
     d2_ = d * 2;
     d3_ = d * 3;
-    auto GetActKernel = [&](const std::string& type,
-                            int n) -> std::shared_ptr<const VActKernel<T>> {
-      if (type == "sigmoid") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VSigmoidKernel<T>>(n));
-      } else if (type == "relu") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VReluKernel<T>>(n));
-      } else if (type == "tanh") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VTanhKernel<T>>(n));
-      } else if (type == "identity" || type == "") {
-        return std::dynamic_pointer_cast<const VActKernel<T>>(
-            KernelPool::Instance().template Get<VIdentityKernel<T>>(n));
-      }
-      PADDLE_THROW("Not support type: %s", type);
-    };
-    act_gate_3d_ = GetActKernel(act_gate, d * 3);
-    act_cand_d_ = GetActKernel(act_cand, d);
-    act_cell_d_ = GetActKernel(act_cell, d);
+    act_gate_d_ = GetActKernel<T>(act_gate, d);
+    act_cand_d_ = GetActKernel<T>(act_cand, d);
+    act_cell_d_ = GetActKernel<T>(act_cell, d);
     vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
     vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
+    vadd_d2_ = KernelPool::Instance().template Get<VAddKernel<T>>(d2_);
+    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
   }
 
-  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht,
+  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
                    T* checked) const override {
-    // gates: W_ch, W_ih, W_fh, W_oh
-    act_gate_3d_->Compute(gates + d_, gates + d_);
-
-    /* C_t = C_t-1 * fgated + cand_gated * igated */
+    /* get fgated and igated*/
+    vmul_d_->Compute(wp_data, ct_1, checked);
+    vmul_d_->Compute(wp_data + d_, ct_1, checked + d_);
+    vadd_d2_->Compute(checked, gates + d_, gates + d_);
+    act_gate_d2_->Compute(gates + d_, gates + d_);
+    /* C_t = C_t-1 * fgated + cand_gated * igated*/
     act_cand_d_->Compute(gates, gates);
     vmul_d_->Compute(gates, gates + d_, gates + d_);
    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_);
     vadd_d_->Compute(gates + d_, gates + d2_, ct);
-
+    /* get ogated*/
+    vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
+    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
+    act_gate_d_->Compute(gates + d3_, gates + d3_);
     /* H_t = act_cell(C_t) * ogated */
     act_cell_d_->Compute(ct, gates + d2_);
     vmul_d_->Compute(gates + d2_, gates + d3_, ht);
   }
+  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
+    /* C_t = igated * cgated*/
+    act_gate_d_->Compute(gates + d_, gates + d_);
+    act_cand_d_->Compute(gates, gates);
+    vmul_d_->Compute(gates, gates + d_, ct);
+    /* get outgated, put W_oc * C_t on igated */
+    vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
+    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
+    /* H_t = act_cell(C_t) * ogated */
+    act_gate_d_->Compute(gates + d3_, gates + d3_);
+    act_cell_d_->Compute(ct, gates + d2_);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+  }
 
  private:
   int d_, d2_, d3_;
-  std::shared_ptr<const VActKernel<T>> act_gate_3d_, act_cand_d_, act_cell_d_;
+  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_cand_d_,
+      act_cell_d_;
   std::shared_ptr<const VMulKernel<T>> vmul_d_;
-  std::shared_ptr<const VAddKernel<T>> vadd_d_;
+  std::shared_ptr<const VAddKernel<T>> vadd_d_, vadd_d2_;
 };
 
 #define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \
......
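
The activation lookup that was previously duplicated as a lambda in both kernel constructors is now the file-level helper GetActKernel<T>; its use in the constructors above reduces to the following sketch (act_gate is the activation attribute string and d the hidden width):

    act_gate_d_ = GetActKernel<T>(act_gate, d);       // single-gate activation
    act_gate_d2_ = GetActKernel<T>(act_gate, d * 2);  // fused i,f gates (peephole kernel)
    act_gate_d3_ = GetActKernel<T>(act_gate, d * 3);  // fused i,f,o gates (plain kernel)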