Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_cudnn_lstm

1ffe41d7 · liuhongyu · 05917c3c · 1c48d614 · 1ffe41d7 · 1ffe41d7
23 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -43,6 +43,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
    CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
    make -j8 > /dev/null && make altinstall > /dev/null

+RUN rm -r /root/python_build
+
 RUN apt-get update && \
    apt-get install -y --allow-downgrades patchelf \
    python3 python3-dev python3-pip \

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -186,8 +186,7 @@ set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
       ${src_dir}/${module}/api/paddle_*.h
-       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
        )

 set(module "platform")

--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
@@ -99,9 +99,8 @@ TEST(BestFitAllocator, test_concurrent_cpu_allocation) {

  LockedAllocator locked_allocator(std::move(best_fit_allocator));

-  auto th_main = [&] {
-    std::random_device dev;
-    std::default_random_engine engine(dev());
+  auto th_main = [&](std::random_device::result_type seed) {
+    std::default_random_engine engine(seed);
    std::uniform_int_distribution<size_t> dist(1U, 1024U);

    for (size_t i = 0; i < 128; ++i) {
@@ -125,7 +124,8 @@ TEST(BestFitAllocator, test_concurrent_cpu_allocation) {
  {
    std::vector<std::thread> threads;
    for (size_t i = 0; i < 1024; ++i) {
-      threads.emplace_back(th_main);
+      std::random_device dev;
+      threads.emplace_back(th_main, dev());
    }
    for (auto& th : threads) {
      th.join();

--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
@@ -41,9 +41,8 @@ TEST(BestFitAllocator, concurrent_cuda) {
  LockedAllocator concurrent_allocator(
      std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));

-  auto th_main = [&] {
-    std::random_device dev;
-    std::default_random_engine engine(dev());
+  auto th_main = [&](std::random_device::result_type seed) {
+    std::default_random_engine engine(seed);
    std::uniform_int_distribution<size_t> dist(1U, 1024U);
    platform::CUDAPlace gpu(0);
    platform::CUDADeviceContext dev_ctx(gpu);
@@ -75,7 +74,8 @@ TEST(BestFitAllocator, concurrent_cuda) {
  {
    std::vector<std::thread> threads;
    for (size_t i = 0; i < 1024; ++i) {
-      threads.emplace_back(th_main);
+      std::random_device dev;
+      threads.emplace_back(th_main, dev());
    }
    for (auto& th : threads) {
      th.join();

--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -192,11 +192,14 @@ class FusionGRUKernel : public framework::OpKernel<T> {
  const int M = x_dims[1];                                         \
  const int D = wh_dims[0];                                        \
  const int D2 = D * 2;                                            \
-  const auto& ker = math::jitkernel::KernelPool::Instance()                    \
+  const math::jitkernel::gru_attr_t attr(                          \
+      D, ctx.Attr<std::string>("gate_activation"),                 \
+      ctx.Attr<std::string>("activation"));                        \
+  math::jitkernel::gru_t one_step;                                 \
+  const auto& ker =                                                \
+      math::jitkernel::KernelPool::Instance()                      \
          .template Get<math::jitkernel::GRUKernel<T>,             \
-                                      const std::string&, const std::string&>( \
-                            ctx.Attr<std::string>("gate_activation"),          \
-                            ctx.Attr<std::string>("activation"), D);           \
+                        const math::jitkernel::gru_attr_t&>(attr); \
  const T* x_data = x->data<T>();                                  \
  const T* wx_data = wx->data<T>();                                \
  const T* wh_data = wh->data<T>();                                \
@@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
      if (h0_data) {
        prev_hidden_data = h0_data + bid * D;
      } else {
-        ker->ComputeH1(xx_data, hidden_out_data);
+        one_step.gates = xx_data;
+        one_step.ht = hidden_out_data;
+        ker->ComputeH1(&one_step, &attr);
        prev_hidden_data = hidden_out_data;
        tstart = 1;
        move_step();
@@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel<T> {
        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
                  prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
                  D3);
-        ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data);
+        one_step.gates = xx_data;
+        one_step.ht_1 = prev_hidden_data;
+        one_step.ht = hidden_out_data;
+        ker->ComputeHtPart1(&one_step, &attr);
        // gemm rt * Ws
        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
                  hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
                  xx_data + D2, D3);
-        ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data);
+        ker->ComputeHtPart2(&one_step, &attr);
        // save prev
        prev_hidden_data = hidden_out_data;
        move_step();
@@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
      T* cur_out_data = batched_out_data;
      // W: {W_update, W_reset; W_state}
      for (int i = 0; i < max_bs; ++i) {
-        ker->ComputeH1(cur_in_data, cur_out_data);
+        one_step.gates = cur_in_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeH1(&one_step, &attr);
        // add offset
        cur_in_data += D3;
        cur_out_data += D;
@@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel<T> {
      T* cur_out_data = batched_out_data;
      T* cur_prev_hidden_data = prev_hidden_data;
      for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data,
-                            cur_out_data);
+        one_step.gates = cur_batched_data;
+        one_step.ht_1 = cur_prev_hidden_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeHtPart1(&one_step, &attr);
+
        cur_batched_data += D3;
        cur_prev_hidden_data += D;
        cur_out_data += D;
@@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel<T> {

      cur_prev_hidden_data = prev_hidden_data;
      for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data,
-                            cur_out_data);
+        one_step.gates = cur_batched_data;
+        one_step.ht_1 = cur_prev_hidden_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeHtPart2(&one_step, &attr);
        cur_batched_data += D3;
        cur_prev_hidden_data += D;
        cur_out_data += D;

--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -250,13 +250,17 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");     \
    checked_cell_data = checked_cell->mutable_data<T>(place);   \
  }                                                             \
+  const math::jitkernel::lstm_attr_t attr(                      \
+      D, ctx.Attr<std::string>("gate_activation"),              \
+      ctx.Attr<std::string>("candidate_activation"),            \
+      ctx.Attr<std::string>("cell_activation"), use_peepholes); \
+  math::jitkernel::lstm_t one_step;                             \
+  one_step.wp = wp_data;                                        \
+  one_step.checked = checked_cell_data;                         \
  const auto& ker =                                             \
      math::jitkernel::KernelPool::Instance()                   \
-          .template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
-                        const std::string&, const std::string&>(            \
-              ctx.Attr<std::string>("gate_activation"),                     \
-              ctx.Attr<std::string>("candidate_activation"),                \
-              ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
+          .template Get<math::jitkernel::LSTMKernel<T>,         \
+                        const math::jitkernel::lstm_attr_t&>(attr)

 // Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \
@@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
        prev_h_data = h0_data + bid * D;
        prev_c_data = c0_data + bid * D;
      } else {
-        ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data);
+        one_step.gates = xx_data;
+        one_step.ct = c_out_data;
+        one_step.ht = h_out_data;
+        ker->ComputeC1H1(&one_step, &attr);
        tstart = 1;
        // move one step
        prev_h_data = h_out_data;
@@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      }
      for (int step = tstart; step < seq_len; ++step) {
        GEMM_WH_ADDON(1, prev_h_data, xx_data);
-        ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data,
-                         checked_cell_data);
+
+        one_step.gates = xx_data;
+        one_step.ct_1 = prev_c_data;
+        one_step.ct = c_out_data;
+        one_step.ht = h_out_data;
+        ker->ComputeCtHt(&one_step, &attr);
        // move one step
        prev_h_data = h_out_data;
        prev_c_data = c_out_data;
@@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      T* cur_h_out_data = batched_h_out_data;
      T* cur_c_out_data = batched_c_out_data;
      for (int i = 0; i < max_bs; ++i) {
-        ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data);
+        one_step.gates = cur_in_data;
+        one_step.ct = cur_c_out_data;
+        one_step.ht = cur_h_out_data;
+        ker->ComputeC1H1(&one_step, &attr);
+
        cur_in_data += D4;
        cur_c_out_data += D;
        cur_h_out_data += D;
@@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      T* cur_c_out_data = batched_c_out_data;
      T* cur_h_out_data = batched_h_out_data;
      for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                         cur_h_out_data, wp_data, checked_cell_data);
+        one_step.gates = cur_in_data;
+        one_step.ct_1 = cur_prev_c_data;
+        one_step.ct = cur_c_out_data;
+        one_step.ht = cur_h_out_data;
+        ker->ComputeCtHt(&one_step, &attr);
+
        // move one batch
        cur_in_data += D4;
        cur_prev_c_data += D;

--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -76,11 +76,12 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {

    AddAttr<int>("out_h", "output height of interpolate op.");
    AddAttr<int>("out_w", "output width of interpolate op.");
-    AddAttr<std::string>(
-        "interp_method",
-        "(string), interpolation method, can be \"bilinear\" for "
+    AddAttr<std::string>("interp_method",
+                         "(string, default \"bilinear\"), interpolation "
+                         "method, can be \"bilinear\" for "
                         "bilinear interpolation and \"nearest\" for nearest "
-        "neighbor interpolation.");
+                         "neighbor interpolation.")
+        .SetDefault("bilinear");
    AddComment(R"DOC(
          This operator samples input X to given output shape by using specified
          interpolation method, the interpolation methods can be \"nearest\"
@@ -132,11 +133,19 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker,
+REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad);
-REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel<float>,
+REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad);
+REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad);
+REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>,
+                       ops::InterpolateKernel<double>,
+                       ops::InterpolateKernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::InterpolateGradKernel<float>,
+                       ops::InterpolateGradKernel<double>);
+REGISTER_OP_CPU_KERNEL(nearest_interp, ops::InterpolateKernel<float>,
                       ops::InterpolateKernel<double>,
                       ops::InterpolateKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel<float>,
+REGISTER_OP_CPU_KERNEL(nearest_interp_grad, ops::InterpolateGradKernel<float>,
                       ops::InterpolateGradKernel<double>);
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -284,9 +284,15 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel<float>,
+REGISTER_OP_CUDA_KERNEL(bilinear_interp, ops::InterpolateOpCUDAKernel<float>,
                        ops::InterpolateOpCUDAKernel<double>,
                        ops::InterpolateOpCUDAKernel<int>);
-REGISTER_OP_CUDA_KERNEL(interpolate_grad,
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
+                        ops::InterpolateGradOpCUDAKernel<float>,
+                        ops::InterpolateGradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp, ops::InterpolateOpCUDAKernel<float>,
+                        ops::InterpolateOpCUDAKernel<double>,
+                        ops::InterpolateOpCUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(nearest_interp_grad,
                        ops::InterpolateGradOpCUDAKernel<float>,
                        ops::InterpolateGradOpCUDAKernel<double>);
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/jit_code.h"
+#include <stddef.h>                                  // offsetof
 #include "paddle/fluid/operators/math/jit_kernel.h"  // TODO(TJ): remove me

 namespace paddle {
@@ -139,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) {
 }

 void VActJitCode::generate() {
-  xmm_t xmm_zero = xmm_t(2);
-  ymm_t ymm_zero = ymm_t(2);
-  if (type_ == operand_type::relu) {
-    vxorps(ymm_zero, ymm_zero, ymm_zero);
-  }
  int offset = 0;
  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
    vmovups(ymm_src, ptr[param1 + offset]);
-    switch (type_) {
-      case operand_type::relu:
-        relu_jmm<ymm_t>(ymm_dst, ymm_src, ymm_zero);
-        break;
-      case operand_type::exp:
-        exp_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::sigmoid:
-        sigmoid_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::tanh:
-        tanh_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::identity:
-        break;
-      default:
-        break;
-    }
+    act<ymm_t>(ymm_dst, ymm_src, type_);
    vmovups(ptr[param2 + offset], ymm_dst);
    offset += sizeof(float) * YMM_FLOAT_BLOCK;
  }
@@ -181,22 +160,7 @@ void VActJitCode::generate() {
      block = 1;
      vmovss(xmm_src, ptr[param1 + offset]);
    }
-    switch (type_) {
-      case operand_type::relu:
-        relu_jmm<xmm_t>(xmm_dst, xmm_src, xmm_zero);
-        break;
-      case operand_type::exp:
-        exp_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::sigmoid:
-        sigmoid_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
-        break;
-      case operand_type::tanh:
-        tanh_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
-        break;
-      default:
-        break;
-    }
+    act<xmm_t>(xmm_dst, xmm_src, type_);
    if (rest >= 4) {
      vmovups(ptr[param2 + offset], xmm_dst);
    } else if (rest >= 2) {
@@ -210,6 +174,158 @@ void VActJitCode::generate() {
  ret();
 }

+// Reports whether the JIT LSTM path can be used for a hidden size of `d`.
+bool LSTMJitCode::init(int d) {
+  // The generated code relies on AVX instructions.
+  if (!MayIUse(avx)) {
+    return false;
+  }
+  // Only whole 8-float blocks are emitted, so `d` must be a multiple of 8.
+  return d % 8 == 0;
+}
+
+// Emits the machine code for one LSTM step over num_ floats, handled in
+// chunks of YMM_FLOAT_BLOCK. The generated function reads its data pointers
+// (gates, ct_1, ct, ht and, with peephole, wp) from the lstm_t passed as the
+// first argument (param1). Only full blocks are emitted; there is no tail
+// handling here (init() only accepts d % 8 == 0).
+void LSTMJitCode::generate() {
+  // r12 is only used on the peephole path, which is also the only path
+  // wrapped in preCode()/postCode().
+  // NOTE(review): presumably preCode()/postCode() save and restore
+  // callee-saved registers such as r12 -- confirm in the JitCode base class.
+  if (use_peephole_) {
+    preCode();
+  }
+  reg64_t reg_ptr_gates = rax;
+  reg64_t reg_ptr_ct_1 = r9;
+  reg64_t reg_ptr_ct = r10;
+  reg64_t reg_ptr_ht = r11;
+  reg64_t reg_ptr_wp = r12;
+  // Load the raw data pointers out of the lstm_t argument.
+  mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
+  mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
+  mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
+  mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
+  if (use_peephole_) {
+    mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]);
+  }
+
+  // offset: byte offset of the current block inside each segment.
+  // d: byte size of one segment; the four gate segments (and the three
+  // peephole weight segments) are addressed as base + k * d.
+  int offset = 0;
+  int d = num_ * sizeof(float);
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    /* gates: W_ch, W_ih, W_fh, W_oh */
+    // Registers 0~7 hold the live values of this block; the act() helper is
+    // called with registers 11~15 as its scratch set, so they do not clash.
+    ymm_t ymm_c = ymm_t(0);
+    ymm_t ymm_i = ymm_t(1);
+    ymm_t ymm_f = ymm_t(2);
+    ymm_t ymm_o = ymm_t(3);
+    ymm_t ymm_ct_1 = ymm_t(4);
+    ymm_t ymm_wp0 = ymm_t(5);
+    ymm_t ymm_wp1 = ymm_t(6);
+    ymm_t ymm_wp2 = ymm_t(7);
+    vmovups(ymm_c, ptr[reg_ptr_gates + offset]);
+    vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]);
+    vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]);
+    vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]);
+    // The previous cell state is only read when this is not the first step.
+    if (!compute_c1h1_) {
+      vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]);
+    }
+    if (use_peephole_) {
+      vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]);
+      vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]);
+      vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]);
+    }
+    /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */
+    // act_cand(c)
+    act<ymm_t>(ymm_c, ymm_c, act_cand_);
+    // act_gate(i) or act_gate(ct_1 * wp0 + i)
+    if (!compute_c1h1_ && use_peephole_) {
+      vmulps(ymm_wp0, ymm_ct_1, ymm_wp0);
+      vaddps(ymm_i, ymm_i, ymm_wp0);
+    }
+    act<ymm_t>(ymm_i, ymm_i, act_gate_);
+    // ymm_c now holds act_cand(c) * act_gate(i); for the C1H1 step this is
+    // already the new cell state.
+    vmulps(ymm_c, ymm_c, ymm_i);
+    if (!compute_c1h1_) {
+      // act_gate(f) or act_gate(ct_1 * wp1 + f)
+      if (use_peephole_) {
+        vmulps(ymm_wp1, ymm_ct_1, ymm_wp1);
+        vaddps(ymm_f, ymm_f, ymm_wp1);
+      }
+      act<ymm_t>(ymm_f, ymm_f, act_gate_);
+      // ct
+      vmulps(ymm_f, ymm_f, ymm_ct_1);
+      vaddps(ymm_f, ymm_f, ymm_c);
+    }
+    /* H_t = act_cell(C_t) * act_gate(o) */
+    // act_cell(C_t)
+    // The new cell state lives in ymm_c (C1H1) or ymm_f (regular step).
+    // ymm_tmp reuses the input-gate register, whose value is no longer needed.
+    ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
+    ymm_t ymm_tmp = ymm_i;
+    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
+    // act_gate(o) or act_gate(ct * wp2 + o)
+    if (use_peephole_) {
+      vmulps(ymm_wp2, ymm_ct, ymm_wp2);
+      vaddps(ymm_o, ymm_o, ymm_wp2);
+    }
+    act<ymm_t>(ymm_o, ymm_o, act_gate_);
+    // ht
+    vmulps(ymm_o, ymm_o, ymm_tmp);
+    // save ct and ht
+    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
+    vmovups(ptr[reg_ptr_ht + offset], ymm_o);
+    // Advance to the next block of YMM_FLOAT_BLOCK floats.
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
+
+  // The peephole path finishes through postCode(), matching the preCode()
+  // above; the plain path just returns.
+  if (use_peephole_) {
+    postCode();
+  } else {
+    ret();
+  }
+}
+
+// Reports whether the JIT GRU path can be used for a hidden size of `d`.
+bool GRUJitCode::init(int d) {
+  // The generated code relies on AVX instructions.
+  if (!MayIUse(avx)) {
+    return false;
+  }
+  // Only whole 8-float blocks are emitted, so `d` must be a multiple of 8.
+  return d % 8 == 0;
+}
+
+// Emits the machine code for one GRU kernel variant over num_ floats, handled
+// in chunks of YMM_FLOAT_BLOCK. id_ selects the variant: 0 computes the first
+// hidden state without ht_1, 1 computes act_gate(r) * ht_1, and 2 combines
+// the update gate, the candidate and ht_1 into the final ht. The generated
+// function reads its data pointers from the gru_t passed as the first
+// argument (param1). Only full blocks are emitted; there is no tail handling.
+void GRUJitCode::generate() {
+  reg64_t reg_ptr_gates = rax;
+  reg64_t reg_ptr_ht_1 = r9;
+  reg64_t reg_ptr_ht = r10;
+  // Load the raw data pointers out of the gru_t argument.
+  mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]);
+  mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]);
+  mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]);
+  ymm_t ymm_one = ymm_t(0);
+
+  // Only variant 2 needs the constant loaded from exp_float_consts at
+  // OFFSET_EXP_ONE (used below as the "1" in 1 - act_gate(u)). It is loaded
+  // once, outside the loop, into register 0, which the loop body and the
+  // act() scratch set (11~15) do not overwrite.
+  if (id_ == 2) {
+    reg64_t reg_ptr_tmp = r11;
+    mov(reg_ptr_tmp, reinterpret_cast<size_t>(exp_float_consts));
+    vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]);
+  }
+  // offset: byte offset of the current block inside each segment.
+  // d: byte size of one gate segment; segments are addressed as base + k * d.
+  int offset = 0;
+  int d = num_ * sizeof(float);
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    ymm_t ymm_u = ymm_t(1);
+    ymm_t ymm_r = ymm_t(2);
+    ymm_t ymm_s = ymm_t(3);
+    ymm_t ymm_ht_1 = ymm_t(4);
+    // W: {W_update, W_reset; W_state}
+    // Each variant loads only the segments it actually uses.
+    if (id_ == 0 || id_ == 2) {
+      vmovups(ymm_u, ptr[reg_ptr_gates + offset]);
+      vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]);
+    }
+    if (id_ == 1) {
+      vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]);
+    }
+    if (id_ == 1 || id_ == 2) {
+      vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]);
+    }
+
+    if (id_ == 0) {
+      // ht = act_gate(u) * act_cand(s)
+      act<ymm_t>(ymm_u, ymm_u, act_gate_);
+      act<ymm_t>(ymm_s, ymm_s, act_cand_);
+      vmulps(ymm_s, ymm_s, ymm_u);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_s);
+    } else if (id_ == 1) {
+      // ht = act_gate(r) * ht_1
+      act<ymm_t>(ymm_r, ymm_r, act_gate_);
+      vmulps(ymm_r, ymm_r, ymm_ht_1);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_r);
+    } else if (id_ == 2) {
+      // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
+      // ymm_one_inner names the same register as ymm_one.
+      ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx());
+      act<ymm_t>(ymm_u, ymm_u, act_gate_);
+      act<ymm_t>(ymm_s, ymm_s, act_cand_);
+      vmulps(ymm_s, ymm_s, ymm_u);
+      vsubps(ymm_u, ymm_one_inner, ymm_u);
+      vmulps(ymm_u, ymm_ht_1, ymm_u);
+      vaddps(ymm_u, ymm_s, ymm_u);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_u);
+    }
+    // Advance to the next block of YMM_FLOAT_BLOCK floats.
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
+
+  ret();
+}
 }  // namespace gen
 }  // namespace jitkernel
 }  // namespace math

--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <string>
 #include "paddle/fluid/operators/math/jit_gen.h"
+#include "paddle/fluid/operators/math/jit_kernel_impl.h"
 #include "paddle/fluid/platform/cpu_info.h"

 namespace paddle {
@@ -46,14 +47,6 @@ extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];

-// TODO(TJ): move these to some proper place
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
 #define ALIGN32 __attribute__((aligned(32)))
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
@@ -176,31 +169,34 @@ class VActJitCode : public JitCode {
 protected:
  // compute relu with ymm, xmm
  template <typename JMM>
-  void relu_jmm(JMM& dst, JMM& src, JMM& zero) {  // NOLINT
+  void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) {  // NOLINT
+    JMM zero = JMM(zero_idx);
+    vxorps(zero, zero, zero);
    vmaxps(dst, src, zero);
  }

  // compute exp with ymm, xmm
  template <typename JMM>
-  void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3,  // NOLINT
-               int mask_idx = 4, int tmp_idx = 5) {
+  void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12,  // NOLINT
+               int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
    using namespace platform::jit;  // NOLINT
-    assert(src.getIdx() != dst.getIdx());  // TODO(TJ): use enfore
    // check all idx can not equal
+    JMM jmm_src = JMM(src_idx);
    JMM jmm_fx = JMM(fx_idx);
    JMM jmm_fy = JMM(fy_idx);
    JMM jmm_mask = JMM(mask_idx);
    JMM jmm_tmp = JMM(tmp_idx);
    reg64_t reg_ptr_global = rax;
    push(reg_ptr_global);
+    vmovaps(jmm_src, src);
    mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
-    vminps(src, src, jmm_tmp);
+    vminps(jmm_src, jmm_src, jmm_tmp);
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
-    vmaxps(src, src, jmm_tmp);
+    vmaxps(jmm_src, jmm_src, jmm_tmp);
    // express exp(x) as exp(g + n*log(2))
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
-    vmulps(jmm_fx, src, jmm_tmp);
+    vmulps(jmm_fx, jmm_src, jmm_tmp);
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
    vaddps(jmm_fx, jmm_fx, jmm_tmp);
    vroundps(jmm_fy, jmm_fx, 0x01);
@@ -214,21 +210,21 @@ class VActJitCode : public JitCode {
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
    JMM ymm_z = JMM(jmm_mask.getIdx());
    vmulps(ymm_z, jmm_fx, jmm_tmp);
-    vsubps(src, src, jmm_fy);
-    vsubps(src, src, ymm_z);
-    vmulps(ymm_z, src, src);
+    vsubps(jmm_src, jmm_src, jmm_fy);
+    vsubps(jmm_src, jmm_src, ymm_z);
+    vmulps(ymm_z, jmm_src, jmm_src);
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
-    vmulps(dst, src, jmm_tmp);
+    vmulps(dst, jmm_src, jmm_tmp);
    for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
         i += (YMM_FLOAT_BLOCK * sizeof(float))) {
      vmovaps(jmm_tmp, ptr[reg_ptr_global + i]);  // P1~P4
      vaddps(dst, dst, jmm_tmp);
-      vmulps(dst, dst, src);
+      vmulps(dst, dst, jmm_src);
    }
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
    vaddps(dst, dst, jmm_tmp);
    vmulps(dst, dst, ymm_z);
-    vaddps(dst, dst, src);
+    vaddps(dst, dst, jmm_src);
    vmovaps(jmm_tmp, ptr[reg_ptr_global]);
    vaddps(dst, dst, jmm_tmp);
    // build 2^n
@@ -265,20 +261,23 @@ class VActJitCode : public JitCode {

  // compute sigmoid with ymm, xmm
  template <typename JMM>
-  void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2,  // NOLINT
-                   int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) {
+  void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
+                   int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
+                   int tmp_idx = 15) {
    // y = 1 / (1 + e^-x)
    JMM jmm_tmp = JMM(tmp_idx);
+    JMM jmm_src = JMM(src_idx);
    reg64_t reg_ptr_global = rax;
    push(reg_ptr_global);
+    vmovaps(jmm_src, src);
    mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
-    vminps(src, src, jmm_tmp);
+    vminps(jmm_src, jmm_src, jmm_tmp);
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
-    vmaxps(src, src, jmm_tmp);
+    vmaxps(jmm_src, jmm_src, jmm_tmp);
    vxorps(jmm_tmp, jmm_tmp, jmm_tmp);
-    vsubps(src, jmm_tmp, src);
-    exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
+    vsubps(jmm_src, jmm_tmp, jmm_src);
+    exp_jmm<JMM>(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx);
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
    vaddps(dst, dst, jmm_tmp);
    vdivps(dst, jmm_tmp, dst);
@@ -287,19 +286,22 @@ class VActJitCode : public JitCode {

  // compute tanh with ymm, xmm
  template <typename JMM>
-  void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3,  // NOLINT
-                int mask_idx = 4, int tmp_idx = 5) {
+  void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
+                int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
+                int tmp_idx = 15) {
    // y = 2 / (1 + e^(-2x)) - 1
+    JMM jmm_src = JMM(src_idx);
    JMM jmm_tmp = JMM(tmp_idx);
    JMM jmm_zero = JMM(mask_idx);
    reg64_t reg_ptr_global = rax;
    push(reg_ptr_global);
+    vmovaps(jmm_src, src);
    mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
    vxorps(jmm_zero, jmm_zero, jmm_zero);
    vsubps(jmm_tmp, jmm_zero, jmm_tmp);
-    vmulps(src, src, jmm_tmp);
-    exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
+    vmulps(jmm_src, jmm_src, jmm_tmp);
+    exp_jmm<JMM>(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx);
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
    vaddps(dst, dst, jmm_tmp);
    vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
@@ -309,6 +311,30 @@ class VActJitCode : public JitCode {
    pop(reg_ptr_global);
  }

+  // Emits code that applies the activation `type` to `src` and leaves the
+  // result in `dst`. relu is emitted with register 15 as its zero register;
+  // exp/sigmoid/tanh are emitted with registers 11~15 as scratch, so callers
+  // must not keep live values in 11~15 across this call.
+  // An unknown `type` emits nothing.
+  template <typename JMM>
+  void act(JMM& dst, JMM& src, operand_type type) {  // NOLINT
+    // use 11~15
+    switch (type) {
+      case operand_type::relu:
+        relu_jmm<JMM>(dst, src, 15);
+        break;
+      case operand_type::exp:
+        exp_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
+        break;
+      case operand_type::sigmoid:
+        sigmoid_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
+        break;
+      case operand_type::tanh:
+        tanh_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
+        break;
+      case operand_type::identity:
+        // Identity still has to deliver the value in dst: callers may pass
+        // two different registers and read dst afterwards. Skip the move
+        // when both operands already name the same register.
+        if (dst.getIdx() != src.getIdx()) {
+          vmovaps(dst, src);
+        }
+        break;
+      default:
+        // throw error
+        break;
+    }
+  }
+
 protected:
  int num_;
  operand_type type_;
@@ -322,6 +348,148 @@ class VActJitCode : public JitCode {
  ymm_t ymm_dst = ymm_t(1);
 };

+// JIT code generator for one fused LSTM step. `compute_c1h1` selects the
+// first-step variant (no previous cell state); the remaining configuration
+// (width, peephole flag, activation names) comes from `attr`.
+class LSTMJitCode : public VActJitCode {
+ public:
+  // Returns a descriptive name that encodes the peephole/C1H1 flags and the
+  // gate, candidate and cell activations. The string is built once in the
+  // constructor and owned by this object, so the returned pointer stays
+  // valid for the lifetime of the object (returning c_str() of a local
+  // std::string would dangle).
+  const char* name() const override { return name_.c_str(); }
+
+  // The activation type handed to the VActJitCode base is only a
+  // placeholder; the original author flagged this as buggy. The activations
+  // actually used are the act_*_ members set below.
+  explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr,
+                       size_t code_size = 256 * 1024, void* code_ptr = nullptr)
+      : VActJitCode(attr.d, operand_type::sigmoid, code_size, code_ptr),
+        compute_c1h1_(compute_c1h1) {
+    // Maps an activation name to its operand_type. Unknown names currently
+    // fall back to identity instead of raising an error.
+    auto typeExchange = [](const std::string& type) -> gen::operand_type {
+      if (type == "sigmoid") {
+        return operand_type::sigmoid;
+      } else if (type == "relu") {
+        return operand_type::relu;
+      } else if (type == "tanh") {
+        return operand_type::tanh;
+      } else if (type == "identity" || type == "") {
+        return operand_type::identity;
+      }  // else throw error
+      return operand_type::identity;
+    };
+    num_ = attr.d;
+    use_peephole_ = attr.use_peephole;
+    act_gate_ = typeExchange(attr.act_gate);
+    act_cand_ = typeExchange(attr.act_cand);
+    act_cell_ = typeExchange(attr.act_cell);
+    // All fields that feed the name are set now; cache it for name().
+    name_ = BuildName();
+  }
+  static bool init(int d);
+  void generate() override;
+
+ protected:
+  int num_;
+  bool compute_c1h1_;
+  bool use_peephole_;
+  operand_type act_gate_;
+  operand_type act_cand_;
+  operand_type act_cell_;
+  reg64_t param1{abi_param1};
+
+ private:
+  // Appends the "_<Activation>" suffix for `type` to `*out`; unknown types
+  // append nothing.
+  static void AppendTypeStr(operand_type type, std::string* out) {
+    switch (type) {
+      case operand_type::relu:
+        *out += "_Relu";
+        break;
+      case operand_type::exp:
+        *out += "_Exp";
+        break;
+      case operand_type::sigmoid:
+        *out += "_Sigmoid";
+        break;
+      case operand_type::tanh:
+        *out += "_Tanh";
+        break;
+      case operand_type::identity:
+        *out += "_Identity";
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Builds the descriptive name from the current configuration.
+  std::string BuildName() const {
+    std::string base = "LSTMJitCode";
+    if (use_peephole_) {
+      base += "_Peephole";
+    }
+    if (compute_c1h1_) {
+      base += "_C1H1";
+    }
+    AppendTypeStr(act_gate_, &base);
+    AppendTypeStr(act_cand_, &base);
+    AppendTypeStr(act_cell_, &base);
+    return base;
+  }
+
+  // Backing storage for name().
+  std::string name_;
+};
+
+// JIT code generator for one GRU kernel variant. `id` selects the variant:
+// 0 -> H1, 1 -> HtPart1, 2 -> HtPart2 (the same ids drive the name suffix).
+// Width and activation names come from `attr`.
+class GRUJitCode : public VActJitCode {
+ public:
+  // Returns a descriptive name that encodes the variant and the gate and
+  // candidate activations. The string is built once in the constructor and
+  // owned by this object, so the returned pointer stays valid for the
+  // lifetime of the object (returning c_str() of a local std::string would
+  // dangle).
+  const char* name() const override { return name_.c_str(); }
+
+  // The activation type handed to the VActJitCode base is only a
+  // placeholder; the original author flagged this as buggy. The activations
+  // actually used are the act_*_ members set below.
+  explicit GRUJitCode(int id, const gru_attr_t& attr,
+                      size_t code_size = 256 * 1024, void* code_ptr = nullptr)
+      : VActJitCode(attr.d, operand_type::sigmoid, code_size, code_ptr),
+        id_(id) {
+    // Maps an activation name to its operand_type. Unknown names currently
+    // fall back to identity instead of raising an error.
+    auto typeExchange = [](const std::string& type) -> gen::operand_type {
+      if (type == "sigmoid") {
+        return operand_type::sigmoid;
+      } else if (type == "relu") {
+        return operand_type::relu;
+      } else if (type == "tanh") {
+        return operand_type::tanh;
+      } else if (type == "identity" || type == "") {
+        return operand_type::identity;
+      }  // else throw error
+      return operand_type::identity;
+    };
+    num_ = attr.d;
+    act_gate_ = typeExchange(attr.act_gate);
+    act_cand_ = typeExchange(attr.act_cand);
+    // All fields that feed the name are set now; cache it for name().
+    name_ = BuildName();
+  }
+  static bool init(int d);
+  void generate() override;
+
+ protected:
+  int id_;
+  int num_;
+  operand_type act_gate_;
+  operand_type act_cand_;
+  reg64_t param1{abi_param1};
+
+ private:
+  // Appends the "_<Activation>" suffix for `type` to `*out`; unknown types
+  // append nothing.
+  static void AppendTypeStr(operand_type type, std::string* out) {
+    switch (type) {
+      case operand_type::relu:
+        *out += "_Relu";
+        break;
+      case operand_type::exp:
+        *out += "_Exp";
+        break;
+      case operand_type::sigmoid:
+        *out += "_Sigmoid";
+        break;
+      case operand_type::tanh:
+        *out += "_Tanh";
+        break;
+      case operand_type::identity:
+        *out += "_Identity";
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Builds the descriptive name from the current configuration.
+  std::string BuildName() const {
+    std::string base = "GRUJitCode";
+    if (id_ == 0) {
+      base += "_H1";
+    } else if (id_ == 1) {
+      base += "_HtPart1";
+    } else if (id_ == 2) {
+      base += "_HtPart2";
+    }
+    AppendTypeStr(act_gate_, &base);
+    AppendTypeStr(act_cand_, &base);
+    return base;
+  }
+
+  // Backing storage for name().
+  std::string name_;
+};
+
 #ifdef PADDLE_WITH_MKLDNN
 struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator {
  explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024)

--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>  // for shared_ptr
 #include <string>
 #include <unordered_map>
+#include "paddle/fluid/operators/math/jit_kernel_impl.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/macros.h"

@@ -26,14 +27,7 @@ namespace operators {
 namespace math {
 namespace jitkernel {

-// TODO(TJ): move these to some proper place
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
+// TODO(TJ): remove me
 typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;

 class Kernel {
@@ -128,24 +122,18 @@ class VTanhKernel : public VActKernel<T> {};
 template <typename T>
 class LSTMKernel : public Kernel {
 public:
-  virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht,
-                           /* below only used in peephole*/
-                           const T *wp_data = nullptr,
-                           T *checked = nullptr) const = 0;
-
  // compute c1 and h1 without c0 or h0
-  virtual void ComputeC1H1(T *gates, T *ct, T *ht,
-                           /* below only used in peephole*/
-                           const T *wp_data = nullptr) const = 0;
+  void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *);
+  void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *);
 };

 template <typename T>
 class GRUKernel : public Kernel {
 public:
  // compute h1 without h0
-  virtual void ComputeH1(T *gates, T *ht) const = 0;
-  virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0;
-  virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0;
+  void (*ComputeH1)(gru_t *, const gru_attr_t *);
+  void (*ComputeHtPart1)(gru_t *, const gru_attr_t *);
+  void (*ComputeHtPart2)(gru_t *, const gru_attr_t *);
 };

 template <typename T>

--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/jit_kernel.h"
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"
 #include "paddle/fluid/platform/enforce.h"

 #ifdef PADDLE_WITH_XBYAK
@@ -31,49 +32,6 @@ namespace math {
 namespace jitkernel {
 namespace jit = platform::jit;

-template <typename T>
-void VMulRefer(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] * y[i];
-  }
-}
-
-template <typename T>
-void VAddRefer(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-  }
-}
-
-template <typename T>
-void VAddReluRefer(const T* x, const T* y, T* z, int n) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-    z[i] = z[i] > 0 ? z[i] : 0;
-  }
-}
-
-template <typename T>
-void VScalRefer(const T* a, const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a[0] * x[i];
-  }
-}
-
-template <typename T>
-void VAddBiasRefer(const T* a, const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a[0] + x[i];
-  }
-}
-
-template <typename T>
-void VReluRefer(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
-}
-
 #ifdef PADDLE_WITH_MKLML
 template <typename T>
 void VMulMKL(const T* x, const T* y, T* z, int n);
@@ -109,7 +67,7 @@ void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
  if (x == y) {
    platform::dynload::cblas_sscal(n, *a, y, 1);
  } else {
-    VScalRefer<float>(a, x, y, n);
+    refer::VScal<float>(a, x, y, n);
  }
 }

@@ -118,7 +76,7 @@ void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
  if (x == y) {
    platform::dynload::cblas_dscal(n, *a, y, 1);
  } else {
-    VScalRefer<double>(a, x, y, n);
+    refer::VScal<double>(a, x, y, n);
  }
 }

@@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel<T> {
      return;
    }
 #endif
-    this->Compute = VMulRefer<T>;
+    this->Compute = refer::VMul<T>;
  }

 #ifdef PADDLE_WITH_XBYAK
@@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel<T> {
      return;
    }
 #endif
-    this->Compute = VAddRefer<T>;
+    this->Compute = refer::VAdd<T>;
  }
 #ifdef PADDLE_WITH_XBYAK

@@ -280,7 +238,7 @@ class VAddReluKernelImpl : public VAddReluKernel<T> {
      return;
    }
 #endif
-    this->Compute = VAddReluRefer<T>;
+    this->Compute = refer::VAddRelu<T>;
  }
 #ifdef PADDLE_WITH_XBYAK

@@ -318,7 +276,7 @@ class VScalKernelImpl : public VScalKernel<T> {
      return;
    }
 #endif
-    this->Compute = VScalRefer<T>;
+    this->Compute = refer::VScal<T>;
  }
 #ifdef PADDLE_WITH_XBYAK

@@ -362,7 +320,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel<T> {
    }
 #endif

-    this->Compute = VAddBiasRefer<T>;
+    this->Compute = refer::VAddBias<T>;
  }
 #ifdef PADDLE_WITH_XBYAK

@@ -396,7 +354,7 @@ class VReluKernelImpl : public VReluKernel<T> {
    }
 #endif

-    this->Compute = VReluRefer<T>;
+    this->Compute = refer::VRelu<T>;
  }
 #ifdef PADDLE_WITH_XBYAK

@@ -412,16 +370,13 @@ bool VReluKernelImpl<float>::useJIT(int d) {
 }
 #endif

-template <typename T>
-inline void VIdentityRefer(const T* x, T* y, int n) {}
-
 /* An empty JitKernel */
 template <typename T>
 class VIdentityKernelImpl : public VIdentityKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() {
-    this->Compute = VIdentityRefer<T>;
+    this->Compute = refer::VIdentity<T>;
  }
 };


--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/jit_kernel.h"
-#include <cmath>  // for exp
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"

 #ifdef PADDLE_WITH_XBYAK
 #include "paddle/fluid/operators/math/jit_code.h"
@@ -25,48 +25,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif

-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
 namespace jitkernel {
 namespace jit = platform::jit;

-// TODO(TJ): move refer codes to one file
-// Refer code only focus on correctness
-template <typename T>
-void VExpRefer(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::exp(x[i]);
-  }
-}
-
-template <typename T>
-void VSigmoidRefer(const T* x, T* y, int n) {
-  // y = 1 / (1 + e^-x)
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
-  }
-}
-
-template <typename T>
-void VTanhRefer(const T* x, T* y, int n) {
-  // y = 2 * sigmoid(2x) - 1
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  VSigmoidRefer(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
-  }
-}
-
 #ifdef PADDLE_WITH_MKLML
 // try to use MKL to speedup
 template <typename T>
@@ -129,7 +93,7 @@ class VExpKernelImpl : public VExpKernel<T> {
      return;
    }
 #endif
-    this->Compute = VExpRefer<T>;
+    this->Compute = refer::VExp<T>;
  }

 #ifdef PADDLE_WITH_XBYAK
@@ -182,7 +146,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
      return;
    }
 #endif
-    this->Compute = VSigmoidRefer<T>;
+    this->Compute = refer::VSigmoid<T>;
  }

 #ifdef PADDLE_WITH_XBYAK
@@ -234,7 +198,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
      return;
    }
 #endif
-    this->Compute = VTanhRefer<T>;
+    this->Compute = refer::VTanh<T>;
  }

 #ifdef PADDLE_WITH_XBYAK
@@ -267,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel);
 REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
 REGISTER_JITKERNEL(vtanh, VTanhKernel);

-namespace detail {
-
-#ifdef __AVX__
-
-#define ALIGN32 __attribute__((aligned(32)))
-
-#define _PS256_CONST(Name, Val)                                      \
-  static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
-                                                 Val, Val, Val, Val}
-
-#define _PI256_CONST(Name, Val)                                    \
-  static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
-                                               Val, Val, Val, Val}
-
-_PI256_CONST(0x7f, 0x7f);
-_PS256_CONST(one, 1.f);
-_PS256_CONST(0p5, 0.5f);
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
-_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
-_PS256_CONST(cephes_exp_C1, 0.693359375);
-_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
-_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
-_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
-_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
-_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
-_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
-_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
-
-typedef union imm_xmm_union {
-  __m256i imm;
-  __m128i xmm[2];
-} imm_xmm_union;
-
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
-  {                                         \
-    imm_xmm_union u ALIGN32;                \
-    u.imm = imm_;                           \
-    xmm0_ = u.xmm[0];                       \
-    xmm1_ = u.xmm[1];                       \
-  }
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
-  {                                         \
-    imm_xmm_union u ALIGN32;                \
-    u.xmm[0] = xmm0_;                       \
-    u.xmm[1] = xmm1_;                       \
-    imm_ = u.imm;                           \
-  }
-
-#define AVX2_BITOP_USING_SSE2(fn)                           \
-  static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \
-    /* use SSE2 to perform the bitop AVX2 */                \
-    __m128i x1, x2;                                         \
-    __m256i ret;                                            \
-    COPY_IMM_TO_XMM(x, x1, x2);                             \
-    x1 = _mm_##fn(x1, y);                                   \
-    x2 = _mm_##fn(x2, y);                                   \
-    COPY_XMM_TO_IMM(x1, x2, ret);                           \
-    return ret;                                             \
-  }
-
-#define AVX2_INTOP_USING_SSE2(fn)                                    \
-  static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \
-    /* use SSE2 to perform the AVX2 integer operation */             \
-    __m128i x1, x2;                                                  \
-    __m128i y1, y2;                                                  \
-    __m256i ret;                                                     \
-    COPY_IMM_TO_XMM(x, x1, x2);                                      \
-    COPY_IMM_TO_XMM(y, y1, y2);                                      \
-    x1 = _mm_##fn(x1, y1);                                           \
-    x2 = _mm_##fn(x2, y2);                                           \
-    COPY_XMM_TO_IMM(x1, x2, ret);                                    \
-    return ret;                                                      \
-  }
-
-AVX2_BITOP_USING_SSE2(slli_epi32);
-AVX2_INTOP_USING_SSE2(add_epi32);
-
-#define AVXEXP_BASE                                                            \
-  __m256 tmp = _mm256_setzero_ps(), fx;                                        \
-  __m256 one = *reinterpret_cast<const __m256*>(_ps256_one);                   \
-  __m256i imm0;                                                                \
-  x = _mm256_min_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_hi));       \
-  x = _mm256_max_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_lo));       \
-  /* express exp(x) as exp(g + n*log(2)) */                                    \
-  fx = _mm256_mul_ps(x,                                                        \
-                     *reinterpret_cast<const __m256*>(_ps256_cephes_LOG2EF));  \
-  fx = _mm256_add_ps(fx, *reinterpret_cast<const __m256*>(_ps256_0p5));        \
-  tmp = _mm256_floor_ps(fx);                                                   \
-  /* if greater, substract 1 */                                                \
-  __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);                            \
-  mask = _mm256_and_ps(mask, one);                                             \
-  fx = _mm256_sub_ps(tmp, mask);                                               \
-  tmp = _mm256_mul_ps(fx,                                                      \
-                      *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C1)); \
-  __m256 z = _mm256_mul_ps(                                                    \
-      fx, *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C2));             \
-  x = _mm256_sub_ps(x, tmp);                                                   \
-  x = _mm256_sub_ps(x, z);                                                     \
-  z = _mm256_mul_ps(x, x);                                                     \
-  __m256 y = *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p0);           \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p1));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p2));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p3));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p4));   \
-  y = _mm256_mul_ps(y, x);                                                     \
-  y = _mm256_add_ps(y,                                                         \
-                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p5));   \
-  y = _mm256_mul_ps(y, z);                                                     \
-  y = _mm256_add_ps(y, x);                                                     \
-  y = _mm256_add_ps(y, one);                                                   \
-  /* build 2^n */                                                              \
-  imm0 = _mm256_cvttps_epi32(fx)
-
-__m256 ExpAVX(__m256 x) {
-  AVXEXP_BASE;
-  // two AVX2 instructions using SSE2
-  imm0 = avx2_mm256_add_epi32(imm0,
-                              *reinterpret_cast<const __m256i*>(_pi256_0x7f));
-  imm0 = avx2_mm256_slli_epi32(imm0, 23);
-  __m256 pow2n = _mm256_castsi256_ps(imm0);
-  y = _mm256_mul_ps(y, pow2n);
-  return y;
-}
-#endif
-
-#ifdef __AVX2__
-__m256 ExpAVX2(__m256 x) {
-  AVXEXP_BASE;
-  // two AVX2 instructions
-  imm0 = _mm256_add_epi32(imm0, *reinterpret_cast<const __m256i*>(_pi256_0x7f));
-  imm0 = _mm256_slli_epi32(imm0, 23);
-  __m256 pow2n = _mm256_castsi256_ps(imm0);
-  y = _mm256_mul_ps(y, pow2n);
-  return y;
-}
-#endif
-
-}  // namespace detail
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/operators/math/jit_kernel_impl.h
+++ b/paddle/fluid/operators/math/jit_kernel_impl.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <type_traits>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+#define XMM_FLOAT_BLOCK 4
+#define YMM_FLOAT_BLOCK 8
+#define ZMM_FLOAT_BLOCK 16
+
+typedef struct {  // type-erased buffers for one LSTM step
+  void* gates;  // gates: W_ch, W_ih, W_fh, W_oh
+  const void* ct_1;  // C_t-1, only read
+  void* ct;          // C_t
+  void* ht;          // H_t
+  /* weight_peephole and checked data are only used in peephole */
+  const void* wp{nullptr};
+  void* checked{nullptr};
+} lstm_t;
+
+typedef struct {  // type-erased buffers for one GRU step
+  void* gates;  // gates: {W_update, W_reset; W_state}
+  const void* ht_1;  // h_t-1, only read
+  void* ht;          // h_t
+} gru_t;
+
+// Attributes shared by the RNN kernels: the size d and the names of the gate
+// and candidate activations.
+struct rnn_attr_s {
+  // Zero-initialised so that a default-constructed attribute never exposes an
+  // indeterminate size.
+  int d{0};
+  std::string act_gate, act_cand;
+  rnn_attr_s() = default;
+  rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand)
+      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
+};
+
+// LSTM attributes: the common RNN attributes plus the cell activation name and
+// the peephole switch.
+struct lstm_attr_s : public rnn_attr_s {
+  // Defaults to false so that a default-constructed attribute never exposes an
+  // indeterminate flag.
+  bool use_peephole{false};
+  std::string act_cell;
+  lstm_attr_s() = default;
+  lstm_attr_s(int _d, const std::string& _act_gate,
+              const std::string& _act_cand, const std::string& _act_cell,
+              bool _use_peephole = false)
+      : rnn_attr_s(_d, _act_gate, _act_cand),
+        use_peephole(_use_peephole),
+        act_cell(_act_cell) {}
+};
+
+typedef struct rnn_attr_s gru_attr_t;
+typedef struct lstm_attr_s lstm_attr_t;
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
@@ -82,10 +82,10 @@ namespace jitkernel {
 #define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name,     \
                                marco_declare, macro_find_key, macro_impl) \
  marco_define_name(ker_key, ker_class);                                   \
-  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE,       \
-                                JITKERNEL_FIND_KEY, JITKERNEL_IMPL);       \
-  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE,      \
-                                JITKERNEL_FIND_KEY, JITKERNEL_IMPL)
+  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare,           \
+                                macro_find_key, macro_impl);               \
+  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare,          \
+                                macro_find_key, macro_impl)

 #define REGISTER_JITKERNEL(ker_key, ker_class)                       \
  REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \

--- a/paddle/fluid/operators/math/jit_kernel_refer.h
+++ b/paddle/fluid/operators/math/jit_kernel_refer.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cmath>
+#include <string>
+#include "paddle/fluid/operators/math/jit_kernel_impl.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+namespace refer {
+/* Refer code only focuses on correctness */
+
+template <typename T>  // z[i] = x[i] * y[i] for i in [0, n)
+void VMul(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+template <typename T>  // z[i] = x[i] + y[i] for i in [0, n)
+void VAdd(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] + y[i];
+  }
+}
+
+template <typename T>  // z[i] = relu(x[i] + y[i]) for i in [0, n)
+void VAddRelu(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] + y[i];
+    z[i] = z[i] > 0 ? z[i] : 0;  // sums that are not positive become 0
+  }
+}
+
+template <typename T>  // y[i] = a[0] * x[i]; only a[0] is read from a
+void VScal(const T* a, const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a[0] * x[i];
+  }
+}
+
+template <typename T>  // y[i] = a[0] + x[i]; only a[0] is read from a
+void VAddBias(const T* a, const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a[0] + x[i];
+  }
+}
+
+template <typename T>  // y[i] = x[i] when x[i] > 0, otherwise 0
+void VRelu(const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] > 0 ? x[i] : 0;
+  }
+}
+
+// Identity activation: y[i] = x[i] for i in [0, n).
+// An in-place call (x == y) does nothing. An out-of-place call must copy,
+// because callers such as the refer LSTM functions invoke the cell activation
+// with distinct input and output buffers and then read the output buffer.
+template <typename T>
+inline void VIdentity(const T* x, T* y, int n) {
+  if (x == y) {
+    return;
+  }
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i];
+  }
+}
+
+template <typename T>  // y[i] = std::exp(x[i]) for i in [0, n)
+void VExp(const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::exp(x[i]);
+  }
+}
+
+template <typename T>
+void VSigmoid(const T* x, T* y, int n) {
+  // y = 1 / (1 + e^-x), with x first clamped to the SIGMOID_THRESHOLD range
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  for (int i = 0; i < n; ++i) {
+    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);  // clamp
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
+  }
+}
+
+template <typename T>
+void VTanh(const T* x, T* y, int n) {
+  // y = 2 * sigmoid(2x) - 1, computed in three passes over y
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(2) * x[i];  // y temporarily holds 2x
+  }
+  VSigmoid(y, y, n);  // in place, so VSigmoid's input clamp applies to 2x
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
+  }
+}
+
+template <typename T>  // returns the refer activation function named by type
+void (*getActFunc(const std::string& type))(const T*, T*, int) {  // NOLINT
+  if (type == "sigmoid") {
+    return VSigmoid<T>;
+  } else if (type == "relu") {
+    return VRelu<T>;
+  } else if (type == "tanh") {
+    return VTanh<T>;
+  } else if (type == "identity" || type == "") {  // empty means identity
+    return VIdentity<T>;
+  }
+  PADDLE_THROW("Not support type: %s", type);  // any other name is an error
+  return nullptr;  // presumably unreachable after PADDLE_THROW - TODO confirm
+}
+
+/* Compute ct and ht for one LSTM step (reference implementation).
+ * step->gates holds four blocks of attr->d values (candidate, input, forget,
+ * output) and is overwritten in place as scratch. Reads step->ct_1; writes
+ * step->ct and step->ht. step->wp and step->checked are only dereferenced
+ * when attr->use_peephole is true.
+ */
+template <typename T>
+void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
+  T* ct = reinterpret_cast<T*>(step->ct);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* wp = reinterpret_cast<const T*>(step->wp);
+  T* checked = reinterpret_cast<T*>(step->checked);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  auto act_cell = getActFunc<T>(attr->act_cell);
+  int d = attr->d;
+  int d2 = d * 2;
+  int d3 = d * 3;
+  // gates: W_ch, W_ih, W_fh, W_oh
+  if (attr->use_peephole) {  // add wp * ct_1 to the input and forget gates
+    VMul(wp, ct_1, checked, d);
+    VMul(wp + d, ct_1, checked + d, d);
+    VAdd(checked, gates + d, gates + d, d2);
+    act_gate(gates + d, gates + d, d2);  // output gate is activated later
+  } else {
+    act_gate(gates + d, gates + d, d3);  // input, forget and output gates
+  }
+
+  // C_t = C_t-1 * fgated + cand_gated * igated
+  act_cand(gates, gates, d);
+  VMul(gates, gates + d, gates + d, d);   // gates[d:2d) = cand * igated
+  VMul(ct_1, gates + d2, gates + d2, d);  // gates[2d:3d) = C_t-1 * fgated
+  VAdd(gates + d, gates + d2, ct, d);
+
+  if (attr->use_peephole) {
+    // get ogated
+    VMul(wp + d2, ct, gates + d, d);  // gates[d:2d) is reused as scratch
+    VAdd(gates + d, gates + d3, gates + d3, d);
+    act_gate(gates + d3, gates + d3, d);
+  }
+  // H_t = act_cell(C_t) * ogated
+  act_cell(ct, gates + d2, d);  // out of place: result lands in gates[2d:3d)
+  VMul(gates + d2, gates + d3, ht, d);
+}
+
+/* Compute c1 and h1 without c0 or h0 (reference implementation).
+ * step->ct_1 is not read. step->gates is laid out as four blocks of attr->d
+ * values; the candidate, input and output blocks are consumed and the block
+ * at gates[2d:3d) is only used as scratch. Writes step->ct and step->ht.
+ * step->wp is only dereferenced when attr->use_peephole is true.
+ */
+template <typename T>
+void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ct = reinterpret_cast<T*>(step->ct);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  auto act_cell = getActFunc<T>(attr->act_cell);
+  int d = attr->d;
+  int d2 = d * 2;
+  int d3 = d * 3;
+  /* C_t = igated * cgated*/
+  act_gate(gates + d, gates + d, d);
+  act_cand(gates, gates, d);
+  VMul(gates, gates + d, ct, d);
+  if (attr->use_peephole) {
+    // get outgated, put W_oc * C_t on igated
+    const T* wp = reinterpret_cast<const T*>(step->wp);
+    VMul(wp + d2, ct, gates + d, d);
+    VAdd(gates + d, gates + d3, gates + d3, d);
+  }
+  /* H_t = act_cell(C_t) * ogated */
+  act_gate(gates + d3, gates + d3, d);
+  act_cell(ct, gates + d2, d);  // out of place: result lands in gates[2d:3d)
+  VMul(gates + d2, gates + d3, ht, d);
+}
+
+/* Compute h1 without h0: ht = act_gate(gates[0:d)) * act_cand(gates[2d:3d)).
+ * Both gate blocks are activated in place; gates[d:2d) and step->ht_1 are not
+ * accessed.
+ */
+template <typename T>
+void GRUH1(gru_t* step, const gru_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  int d = attr->d;
+  int d2 = d * 2;
+  act_gate(gates, gates, d);
+  act_cand(gates + d2, gates + d2, d);
+  VMul(gates, gates + d2, ht, d);
+}
+
+/* Compute the first part of GRU: ht = act_gate(r) * ht_1,
+ * where r is the block gates[d:2d), which is activated in place.
+ */
+template <typename T>
+void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
+  // W: {W_update, W_reset; W_state}
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  act_gate(gates + attr->d, gates + attr->d, attr->d);
+  VMul(ht_1, gates + attr->d, ht, attr->d);
+}
+
+// compute the second part of GRU, with u = gates[0:d) and s = gates[2d:3d):
+// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
+template <typename T>
+void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
+  auto act_gate = getActFunc<T>(attr->act_gate);
+  auto act_cand = getActFunc<T>(attr->act_cand);
+  int d = attr->d;
+  T* y = gates + d * 2;  // third block of gates
+  act_gate(gates, gates, d);  // in place
+  act_cand(y, y, d);          // in place
+  // out = zt*ht~ + (1-zt)*ht_1
+  for (int i = 0; i < d; ++i) {
+    ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
+  }
+}
+
+}  // namespace refer
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -15,470 +15,248 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/jit_kernel.h"
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"

-#ifdef __AVX__
-#include <immintrin.h>
+#ifdef PADDLE_WITH_XBYAK
+#include "paddle/fluid/operators/math/jit_code.h"
 #endif

 namespace paddle {
 namespace operators {
 namespace math {
 namespace jitkernel {
-namespace detail {
-#ifdef __AVX__
-__m256 ExpAVX(__m256 x);
-#endif
-
-#ifdef __AVX2__
-__m256 ExpAVX2(__m256 x);
-#endif
-
-}  // namespace detail
-
-namespace jit = platform::jit;
-
-#ifdef __AVX__
-typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type;

-class AVXAct {
- public:
-  virtual ~AVXAct() = default;
-  virtual __m256 Compute(__m256 x) const = 0;
-};
-
-template <act_type type, jit::cpu_isa_t isa>
-class AVXActImpl : public AVXAct {
+/* LSTM JitKernel: binds ComputeCtHt/ComputeC1H1 to JIT code or refer impl */
+template <typename T>
+class LSTMKernelImpl : public LSTMKernel<T> {
 public:
-  __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); }
-};
-
-#define AVX_SIGMOID(isa, expisa)                                 \
-  template <>                                                    \
-  __m256 AVXActImpl<kSigmoid, isa>::Compute(__m256 x) const {    \
-    __m256 ones = _mm256_set1_ps(1.0f);                          \
-    x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \
-    x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \
-    x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x);                  \
-    x = expisa(x);                                               \
-    x = _mm256_add_ps(ones, x);                                  \
-    return _mm256_div_ps(ones, x);                               \
-  }
-
-#define AVX_TANH(isa, expisa)                              \
-  template <>                                              \
-  __m256 AVXActImpl<kTanh, isa>::Compute(__m256 x) const { \
-    __m256 ones = _mm256_set1_ps(1.0f);                    \
-    x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x);           \
-    x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT));   \
-    x = expisa(x);                                         \
-    x = _mm256_add_ps(ones, x);                            \
-    x = _mm256_div_ps(_mm256_set1_ps(2.0f), x);            \
-    return _mm256_sub_ps(x, ones);                         \
+  static inline std::string name(const lstm_attr_t& attr) {
+    PADDLE_THROW("DType should be either float or double");
   }
-
-#define AVX_RELU(isa)                                      \
-  template <>                                              \
-  __m256 AVXActImpl<kRelu, isa>::Compute(__m256 x) const { \
-    return _mm256_max_ps(x, _mm256_setzero_ps());          \
+  static inline bool useJIT(int d) { return false; }  // default: no JIT
+  static inline bool useMKL(int d) { return false; }  // default: no MKL
+  explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(attr.d)) {
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8;
+      jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
+      this->ComputeCtHt =
+          jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+
+      jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096));
+      this->ComputeC1H1 =
+          jitcode1_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+      return;  // JIT pointers installed; skip the refer fallback
     }
+#endif

-#define AVX_IDENTITY(isa)                                      \
-  template <>                                                  \
-  __m256 AVXActImpl<kIdentity, isa>::Compute(__m256 x) const { \
-    return x;                                                  \
+    this->ComputeCtHt = refer::LSTMCtHt<T>;  // non-JIT fallback
+    this->ComputeC1H1 = refer::LSTMC1H1<T>;  // non-JIT fallback
   }

-#define FOR_EACH_AVX_ISA(macro_) \
-  macro_(jit::avx);              \
-  macro_(jit::avx2);             \
-  macro_(jit::avx512f)
-
-FOR_EACH_AVX_ISA(AVX_RELU);
-FOR_EACH_AVX_ISA(AVX_IDENTITY);
-
-AVX_SIGMOID(jit::avx, detail::ExpAVX);
-AVX_TANH(jit::avx, detail::ExpAVX);
+#ifdef PADDLE_WITH_XBYAK

-#ifdef __AVX2__
-AVX_SIGMOID(jit::avx2, detail::ExpAVX2);
-AVX_SIGMOID(jit::avx512f, detail::ExpAVX2);
-AVX_TANH(jit::avx2, detail::ExpAVX2);
-AVX_TANH(jit::avx512f, detail::ExpAVX2);
+ private:
+  std::unique_ptr<gen::LSTMJitCode> jitcode0_{nullptr}, jitcode1_{nullptr};
 #endif
+};

-#undef FOR_EACH_AVX_ISA
-#undef AVX_IDENTITY
-#undef AVX_RELU
-#undef AVX_TANH
-#undef AVX_SIGMOID
-
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool LSTMKernelImpl<float>::useJIT(int d) {
+  return gen::LSTMJitCode::init(d);  // float JIT iff init(d) is true
+}
 #endif

+/* Peephole JitKernel: binds ComputeCtHt/ComputeC1H1 to JIT or refer impl */
 template <typename T>
-static std::shared_ptr<const VActKernel<T>> GetActKernel(
-    const std::string& type, int n) {
-  if (type == "sigmoid") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VSigmoidKernel<T>>(n));
-  } else if (type == "relu") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VReluKernel<T>>(n));
-  } else if (type == "tanh") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VTanhKernel<T>>(n));
-  } else if (type == "identity" || type == "") {
-    return std::dynamic_pointer_cast<const VActKernel<T>>(
-        KernelPool::Instance().template Get<VIdentityKernel<T>>(n));
+class PeepholeKernelImpl : public LSTMKernel<T> {
+ public:
+  static inline std::string name(const lstm_attr_t& attr) {
+    PADDLE_THROW("DType should be either float or double");
   }
-  PADDLE_THROW("Not support type: %s", type);
-  return nullptr;
-}
-
-#ifdef __AVX__
-template <jit::cpu_isa_t isa>
-static std::unique_ptr<AVXAct> GetAVXAct(const std::string& type) {
-  if (type == "sigmoid") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid, isa>());
-  } else if (type == "relu") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu, isa>());
-  } else if (type == "tanh") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh, isa>());
-  } else if (type == "identity" || type == "") {
-    return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity, isa>());
+  static inline bool useJIT(int d) { return false; }  // default: no JIT
+  static inline bool useMKL(int d) { return false; }  // default: no MKL
+  explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(attr.d)) {
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 4 * 8;
+      jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
+      this->ComputeCtHt =
+          jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+
+      jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096));
+      this->ComputeC1H1 =
+          jitcode1_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
+      return;  // JIT pointers installed; skip the refer fallback
     }
-  PADDLE_THROW("Not support type: %s", type);
-  return nullptr;
-}
 #endif

-/* LSTM JitKernel */
-template <typename T, jit::cpu_isa_t isa, jit_block>
-class LSTMKernelImpl : public LSTMKernel<T> {
- public:
-  explicit LSTMKernelImpl(const std::string& act_gate,
-                          const std::string& act_cand,
-                          const std::string& act_cell, int d)
-      : LSTMKernel<T>() {
-    d_ = d;
-    d2_ = d * 2;
-    d3_ = d * 3;
-    act_gate_d3_ = GetActKernel<T>(act_gate, d3_);
-    act_gate_d_ = GetActKernel<T>(act_gate, d);
-    act_cand_d_ = GetActKernel<T>(act_cand, d);
-    act_cell_d_ = GetActKernel<T>(act_cell, d);
-    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
-    vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
+    this->ComputeCtHt = refer::LSTMCtHt<T>;  // non-JIT fallback
+    this->ComputeC1H1 = refer::LSTMC1H1<T>;  // non-JIT fallback
   }

-  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
-                   T* checked) const override {
-    // gates: W_ch, W_ih, W_fh, W_oh
-    act_gate_d3_->Compute(gates + d_, gates + d_, d3_);
-
-    /* C_t = C_t-1 * fgated + cand_gated * igated */
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
-    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
-    vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
-
-    /* H_t = act_cell(C_t) * ogated */
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
-  }
-  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
-    /* C_t = igated * cgated*/
-    act_gate_d_->Compute(gates + d_, gates + d_, d_);
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, ct, d_);
-    /* H_t = act_cell(C_t) * ogated */
-    act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
-  }
+#ifdef PADDLE_WITH_XBYAK

 private:
-  int d_, d2_, d3_;
-  std::shared_ptr<const VActKernel<T>> act_gate_d3_, act_gate_d_, act_cand_d_,
-      act_cell_d_;
-  std::shared_ptr<const VMulKernel<T>> vmul_d_;
-  std::shared_ptr<const VAddKernel<T>> vadd_d_;
-#ifdef __AVX__
-  std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_cand_, avx_act_cell_;
+  std::unique_ptr<gen::LSTMJitCode> jitcode0_{nullptr}, jitcode1_{nullptr};
 #endif
 };

-#define INTRI8_FLOAT(isa)                                                    \
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool PeepholeKernelImpl<float>::useJIT(int d) {
+  return gen::LSTMJitCode::init(d);  // float JIT iff init(d) is true
+}
+#endif
+
+#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class)                 \
  template <>                                                          \
-  LSTMKernelImpl<float, isa, kEQ8>::LSTMKernelImpl(                          \
-      const std::string& act_gate, const std::string& act_cand,              \
-      const std::string& act_cell, int d)                                    \
-      : LSTMKernel<float>() {                                                \
-    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                \
-    avx_act_cand_ = GetAVXAct<isa>(act_cand);                                \
-    avx_act_cell_ = GetAVXAct<isa>(act_cell);                                \
+  std::string ker_class##Impl<float>::name(const lstm_attr_t& attr) {  \
+    std::string key(#ker_key "f");                                     \
+    key += (attr.act_gate + attr.act_cand + attr.act_cell +            \
+            (attr.use_peephole ? "p" : "n"));                          \
+    if (useJIT(attr.d)) {                                              \
+      /* only jit code needs to record d */                            \
+      return key + "jit" + std::to_string(attr.d);                     \
+    } else if (useMKL(attr.d)) {                                       \
+      return key + "mkl";                                              \
+    } else {                                                           \
+      return key + "any";                                              \
    }                                                                  \
-  template <>                                                                \
-  void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt(                        \
-      float* gates, const float* ct_1, float* ct, float* ht,                 \
-      const float* wp_data, float* checked) const {                          \
-    /* gates: W_ch, W_ih, W_fh, W_oh */                                      \
-    __m256 c, i, f, o;                                                       \
-    c = _mm256_loadu_ps(gates);                                              \
-    i = _mm256_loadu_ps(gates + 8);                                          \
-    f = _mm256_loadu_ps(gates + 16);                                         \
-    o = _mm256_loadu_ps(gates + 24);                                         \
-    /* C_t = C_t-1 * fgated + cand_gated * igated*/                          \
-    c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
-    i = _mm256_loadu_ps(ct_1);                                               \
-    f = _mm256_mul_ps(i, avx_act_gate_->Compute(f));                         \
-    f = _mm256_add_ps(c, f);                                                 \
-    _mm256_storeu_ps(ct, f);                                                 \
-    /* H_t = act_cell(C_t) * ogated */                                       \
-    o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
-    _mm256_storeu_ps(ht, o);                                                 \
  }                                                                    \
  template <>                                                          \
-  void LSTMKernelImpl<float, isa, kEQ8>::ComputeC1H1(                        \
-      float* gates, float* ct, float* ht, const float* wp_data) const {      \
-    __m256 c, i, o;                                                          \
-    c = _mm256_loadu_ps(gates);                                              \
-    i = _mm256_loadu_ps(gates + 8);                                          \
-    o = _mm256_loadu_ps(gates + 24);                                         \
-    /* C_t = igated * cgated*/                                               \
-    c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \
-    _mm256_storeu_ps(ct, c);                                                 \
-    /* H_t = act_cell(C_t) * ogated */                                       \
-    o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \
-    _mm256_storeu_ps(ht, o);                                                 \
-  }
-
-// TODO(TJ): optimize keq16
-
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-#endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
-#endif
-
-/* Peephole JitKernel */
-template <typename T, jit::cpu_isa_t isa, jit_block>
-class PeepholeKernelImpl : public LSTMKernel<T> {
- public:
-  explicit PeepholeKernelImpl(const std::string& act_gate,
-                              const std::string& act_cand,
-                              const std::string& act_cell, int d)
-      : LSTMKernel<T>() {
-    d_ = d;
-    d2_ = d * 2;
-    d3_ = d * 3;
-    act_gate_d_ = GetActKernel<T>(act_gate, d);
-    act_cand_d_ = GetActKernel<T>(act_cand, d);
-    act_cell_d_ = GetActKernel<T>(act_cell, d);
-    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
-    vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
-    vadd_d2_ = KernelPool::Instance().template Get<VAddKernel<T>>(d2_);
-    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
-  }
-
-  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
-                   T* checked) const override {
-    /* get fgated and igated*/
-    vmul_d_->Compute(wp_data, ct_1, checked, d_);
-    vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
-    vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_);
-    act_gate_d2_->Compute(gates + d_, gates + d_, d2_);
-    /* C_t = C_t-1 * fgated + cand_gated * igated*/
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
-    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
-    vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
-    /* get ogated*/
-    vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
-    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
-    act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
-    /* H_t = act_cell(C_t) * ogated */
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
-  }
-
-  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
-    /* C_t = igated * cgated*/
-    act_gate_d_->Compute(gates + d_, gates + d_, d_);
-    act_cand_d_->Compute(gates, gates, d_);
-    vmul_d_->Compute(gates, gates + d_, ct, d_);
-    /* get outgated, put W_oc * C_t on igated */
-    vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
-    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
-    /* H_t = act_cell(C_t) * ogated */
-    act_gate_d_->Compute(gates + d3_, gates + d3_, d_);
-    act_cell_d_->Compute(ct, gates + d2_, d_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
+  std::string ker_class##Impl<double>::name(const lstm_attr_t& attr) { \
+    std::string key(#ker_key "d");                                     \
+    /* jit code does not support double yet */                         \
+    if (useMKL(attr.d)) {                                              \
+      return key + "mkl";                                              \
+    } else {                                                           \
+      return key + "any";                                              \
+    }                                                                  \
  }

- private:
-  int d_, d2_, d3_;
-  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_cand_d_,
-      act_cell_d_;
-  std::shared_ptr<const VMulKernel<T>> vmul_d_;
-  std::shared_ptr<const VAddKernel<T>> vadd_d_, vadd_d2_;
-};
-
 #define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype)          \
  template <>                                                 \
  std::shared_ptr<const LSTMKernel<ker_dtype>>                \
-  KernelPool::Get<LSTMKernel<ker_dtype>, const std::string&,          \
-                  const std::string&, const std::string&, int, bool>( \
-      const std::string& act_gate, const std::string& act_cand,       \
-      const std::string& act_cell, int d, bool use_peephole)
+  KernelPool::Get<LSTMKernel<ker_dtype>, const lstm_attr_t&>( \
+      const lstm_attr_t& attr)

-#define JITKERNEL_KEY_LSTM(ker_key, dtype_key)                               \
-  #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \
-                                       (use_peephole ? "p" : "n")
+#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \
+  std::string key = ker_class##Impl<ker_dtype>::name(attr)

-#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k)                    \
-  if (use_peephole) {                                                  \
+#define JITKERNEL_LSTM_IMPL(ker, dtype)                     \
+  if (attr.use_peephole) {                                  \
    p = std::dynamic_pointer_cast<ker<dtype>>(              \
-        std::make_shared<PeepholeKernelImpl<dtype, isa, k>>(           \
-            act_gate, act_cand, act_cell, d));                         \
+        std::make_shared<PeepholeKernelImpl<dtype>>(attr)); \
  } else {                                                  \
    p = std::dynamic_pointer_cast<ker<dtype>>(              \
-        std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_cand, \
-                                                   act_cell, d));      \
+        std::make_shared<ker##Impl<dtype>>(attr));          \
  }

-REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
-                                   JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL);
+REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM,
+                        JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM,
+                        JITKERNEL_LSTM_IMPL);

-#undef INTRI8_FLOAT
+#undef JITKERNEL_LSTM_IMPL
+#undef JITKERNEL_FIND_KEY_LSTM
 #undef JITKERNEL_DECLARE_LSTM
-#undef JITKERNEL_KEY_LSTM
-#undef JITKERNEL_NEW_LSTM_IMPL
+#undef JITKERNEL_DEFINE_NAME_LSTM

 /* GRU JitKernel */
-template <typename T, jit::cpu_isa_t isa, jit_block>
+template <typename T>
 class GRUKernelImpl : public GRUKernel<T> {
 public:
-  explicit GRUKernelImpl(const std::string& act_gate,
-                         const std::string& act_state, int d)
-      : GRUKernel<T>() {
-    d_ = d;
-    d2_ = d * 2;
-    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
-    act_gate_d_ = GetActKernel<T>(act_gate, d);
-    act_state_d_ = GetActKernel<T>(act_state, d);
-    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
-  }
-
-  void ComputeH1(T* gates, T* ht) const override {
-    act_gate_d_->Compute(gates, gates, d_);
-    act_state_d_->Compute(gates + d2_, gates + d2_, d_);
-    vmul_d_->Compute(gates, gates + d2_, ht, d_);
+  static inline std::string name(const gru_attr_t& attr) {
+    PADDLE_THROW("DType should be either float or double");
   }
-
-  void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override {
-    // W: {W_update, W_reset; W_state}
-    act_gate_d2_->Compute(gates, gates, d2_);
-    vmul_d_->Compute(ht_1, gates + d_, ht, d_);
-  }
-
-  void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override {
-    T* y = gates + d2_;
-    act_state_d_->Compute(y, y, d_);
-    // out = zt*ht~ + (1-zt)*ht_1
-    for (int i = 0; i < d_; ++i) {
-      ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
+  static inline bool useJIT(int d) { return false; }  // default: no JIT
+  static inline bool useMKL(int d) { return false; }  // default: no MKL
+  explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    if (useJIT(attr.d)) {
+      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8;
+      jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096));
+      this->ComputeH1 =
+          jitcode0_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
+
+      jitcode1_.reset(new gen::GRUJitCode(1, attr, sz > 4096 ? sz : 4096));
+      this->ComputeHtPart1 =
+          jitcode1_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
+
+      jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096));
+      this->ComputeHtPart2 =
+          jitcode2_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
+      return;  // JIT pointers installed; skip the refer fallback
     }
+#endif
+    this->ComputeH1 = refer::GRUH1<T>;  // non-JIT fallback
+    this->ComputeHtPart1 = refer::GRUHtPart1<T>;  // non-JIT fallback
+    this->ComputeHtPart2 = refer::GRUHtPart2<T>;  // non-JIT fallback
   }
+#ifdef PADDLE_WITH_XBYAK

 private:
-  int d_, d2_;
-  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_state_d_;
-  std::shared_ptr<const VMulKernel<T>> vmul_d_;
-#ifdef __AVX__
-  std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_state_;
+  std::unique_ptr<gen::GRUJitCode> jitcode0_{nullptr}, jitcode1_{nullptr},
+      jitcode2_{nullptr};
 #endif
 };

-#define INTRI8_FLOAT(isa)                                                     \
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool GRUKernelImpl<float>::useJIT(int d) {
+  return gen::GRUJitCode::init(d);  // float JIT iff init(d) is true
+}
+#endif
+
+#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class)                 \
  template <>                                                         \
-  GRUKernelImpl<float, isa, kEQ8>::GRUKernelImpl(                             \
-      const std::string& act_gate, const std::string& act_state, int d)       \
-      : GRUKernel<float>() {                                                  \
-    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                 \
-    avx_act_state_ = GetAVXAct<isa>(act_state);                               \
+  std::string ker_class##Impl<float>::name(const gru_attr_t& attr) {  \
+    std::string key(#ker_key "f");                                    \
+    key += (attr.act_gate + attr.act_cand);                           \
+    if (useJIT(attr.d)) {                                             \
+      /* only jit code needs to record d */                           \
+      return key + "jit" + std::to_string(attr.d);                    \
+    } else if (useMKL(attr.d)) {                                      \
+      return key + "mkl";                                             \
+    } else {                                                          \
+      return key + "any";                                             \
    }                                                                 \
-  template <>                                                                 \
-  void GRUKernelImpl<float, isa, kEQ8>::ComputeH1(float* gates, float* ht)    \
-      const {                                                                 \
-    __m256 u, s;                                                              \
-    /* W: {W_update, W_reset; W_state} */                                     \
-    u = _mm256_loadu_ps(gates);                                               \
-    s = _mm256_loadu_ps(gates + 16);                                          \
-    s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \
-    _mm256_storeu_ps(ht, s);                                                  \
  }                                                                   \
  template <>                                                         \
-  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart1(                       \
-      float* gates, const float* ht_1, float* ht) const {                     \
-    /* not exactly equal the any implementation */                            \
-    __m256 r, ht0;                                                            \
-    r = _mm256_loadu_ps(gates + 8);                                           \
-    ht0 = _mm256_loadu_ps(ht_1);                                              \
-    r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0);                        \
-    _mm256_storeu_ps(ht, r);                                                  \
+  std::string ker_class##Impl<double>::name(const gru_attr_t& attr) { \
+    std::string key(#ker_key "d");                                    \
+    /* jit code does not support double yet */                        \
+    if (useMKL(attr.d)) {                                             \
+      return key + "mkl";                                             \
+    } else {                                                          \
+      return key + "any";                                             \
    }                                                                 \
-  template <>                                                                 \
-  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart2(                       \
-      float* gates, const float* ht_1, float* ht) const {                     \
-    /* not exactly equal the any implementation */                            \
-    __m256 u, s, ht0;                                                         \
-    u = _mm256_loadu_ps(gates);                                               \
-    s = _mm256_loadu_ps(gates + 16);                                          \
-    ht0 = _mm256_loadu_ps(ht_1);                                              \
-    u = avx_act_gate_->Compute(u);                                            \
-    s = _mm256_mul_ps(u, avx_act_state_->Compute(s));                         \
-    u = _mm256_sub_ps(_mm256_set1_ps(1.f), u);                                \
-    u = _mm256_mul_ps(u, ht0);                                                \
-    u = _mm256_add_ps(s, u);                                                  \
-    _mm256_storeu_ps(ht, u);                                                  \
  }

-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-#endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
-#endif
-
 #define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype)         \
  template <>                                               \
-  std::shared_ptr<const GRUKernel<ker_dtype>> KernelPool::Get<            \
-      GRUKernel<ker_dtype>, const std::string&, const std::string&, int>( \
-      const std::string& act_gate, const std::string& act_state, int d)
+  std::shared_ptr<const ker_class<ker_dtype>>               \
+  KernelPool::Get<ker_class<ker_dtype>, const gru_attr_t&>( \
+      const gru_attr_t& attr)

-#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \
-  #ker_key #dtype_key + std::to_string(d) + act_gate + act_state
+#define JITKERNEL_FIND_KEY_GRU(ker_class, ker_dtype) \
+  std::string key = ker_class##Impl<ker_dtype>::name(attr)

-#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \
+#define JITKERNEL_GRU_IMPL(ker, dtype)       \
  p = std::dynamic_pointer_cast<ker<dtype>>( \
-      std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_state, d));
+      std::make_shared<ker##Impl<dtype>>(attr));

-REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU,
-                                   JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL);
+REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DEFINE_NAME_GRU,
+                        JITKERNEL_DECLARE_GRU, JITKERNEL_FIND_KEY_GRU,
+                        JITKERNEL_GRU_IMPL);

-#undef INTRI8_FLOAT
-#undef JITKERNEL_NEW_GRU_IMPL
-#undef JITKERNEL_KEY_GRU
+#undef JITKERNEL_GRU_IMPL
+#undef JITKERNEL_FIND_KEY_GRU
 #undef JITKERNEL_DECLARE_GRU
+#undef JITKERNEL_DEFINE_NAME_GRU
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/operators/math/jit_kernel_refer.h"
 #include "paddle/fluid/platform/port.h"

 #ifdef PADDLE_WITH_MKLML
@@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
  }
 }

-void vrelu_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0.f ? x[i] : 0.f;
-  }
-}
-
 #if defined __AVX__ || defined __AVX2__
 void vrelu_intri8(const int n, const float* x, float* y) {
  __m256 tmp = _mm256_loadu_ps(x);
@@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {

 TEST(JitKernel, vrelu) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) {
    std::vector<float> x(d);
    std::vector<float> zref(d), ztgt(d);
@@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) {
    float* zref_data = zref.data();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vrelu_ref(d, x_data, zref_data);
+      refer::VRelu<float>(x_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();
 #if defined __AVX__ || defined __AVX2__
@@ -90,7 +86,7 @@ TEST(JitKernel, vrelu) {
        vrelu_intri8(d, x_data, zref_data);
      }
      auto si1 = GetCurrentUS();
-      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat << " us";
    }
 #endif
    auto ttgts = GetCurrentUS();
@@ -100,21 +96,16 @@ TEST(JitKernel, vrelu) {
    auto ttgte = GetCurrentUS();
    VLOG(30) << "Vec size " << d
             << ": refer takes: " << (trefe - trefs) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-void vaddbias_ref(const int n, const float a, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] + a;
-  }
-}
-
 TEST(JitKernel, vaddbias) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) {
    std::vector<float> x(d);
    std::vector<float> zref(d), ztgt(d);
@@ -127,7 +118,7 @@ TEST(JitKernel, vaddbias) {
    float* zref_data = zref.data();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vaddbias_ref(d, a, x_data, zref_data);
+      refer::VAddBias<float>(&a, x_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();
    auto ttgts = GetCurrentUS();
@@ -138,19 +129,13 @@ TEST(JitKernel, vaddbias) {

    VLOG(30) << "Vec size " << d
             << ": refer takes: " << (trefe - trefs) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-void vexp_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::exp(x[i]);
-  }
-}
-
 #ifdef PADDLE_WITH_MKLML
 void vexp_mkl(const int n, const float* x, float* y) {
  paddle::platform::dynload::vsExp(n, x, y);
@@ -159,6 +144,7 @@ void vexp_mkl(const int n, const float* x, float* y) {

 TEST(JitKernel, vexp) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) {
    std::vector<float> x(d);
    std::vector<float> zref(d), ztgt(d);
@@ -170,7 +156,7 @@ TEST(JitKernel, vexp) {
    float* zref_data = zref.data();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vexp_ref(d, x_data, zref_data);
+      refer::VExp<float>(x_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();

@@ -196,26 +182,13 @@ TEST(JitKernel, vexp) {
 #else
             << " us, "
 #endif
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-inline float _sigmoid(float x) {
-  const float min = SIGMOID_THRESHOLD_MIN;
-  const float max = SIGMOID_THRESHOLD_MAX;
-  float tmp = (x < min) ? min : ((x > max) ? max : x);
-  return 1.f / (1.f + std::exp(-tmp));
-}
-
-void vsigmoid_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = _sigmoid(x[i]);
-  }
-}
-
 void vsigmoid_better(
    const std::shared_ptr<
        const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp,
@@ -234,6 +207,7 @@ void vsigmoid_better(

 TEST(JitKernel, vsigmoid) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
    std::vector<float> x(d);
    std::vector<float> zref(d), ztgt(d);
@@ -252,7 +226,7 @@ TEST(JitKernel, vsigmoid) {
    auto tmkle = GetCurrentUS();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vsigmoid_ref(d, x_data, zref_data);
+      refer::VSigmoid<float>(x_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();
    auto ttgts = GetCurrentUS();
@@ -264,21 +238,13 @@ TEST(JitKernel, vsigmoid) {
    VLOG(30) << "Vec size " << d
             << ": refer takes: " << (trefe - trefs) / repeat
             << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; }
-
-void vtanh_ref(const int n, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = _tanh(x[i]);
-  }
-}
-
 void vtanh_better(
    const std::shared_ptr<
        const paddle::operators::math::jitkernel::VScalKernel<float>>& vscal,
@@ -298,6 +264,7 @@ void vtanh_better(

 TEST(JitKernel, vtanh) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
    std::vector<float> x(d);
    std::vector<float> zref(d), ztgt(d);
@@ -320,7 +287,7 @@ TEST(JitKernel, vtanh) {
    auto tmkle = GetCurrentUS();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vtanh_ref(d, x_data, zref_data);
+      refer::VTanh<float>(x_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();
    auto ttgts = GetCurrentUS();
@@ -332,39 +299,13 @@ TEST(JitKernel, vtanh) {
    VLOG(30) << "Vec size " << d
             << ": refer takes: " << (trefe - trefs) / repeat
             << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-void lstm_ctht_ref(
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
-        vsigmoid_3d,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VTanhKernel<float>>& vtanh_d,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1,
-    const int d, float* gates, const float* ct_1, float* ct, float* ht) {
-  vsigmoid_3d->Compute(gates + d, gates + d, 3 * d);
-  vtanh_d->Compute(gates, gates, d);
-  const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3;
-  const float min = SIGMOID_THRESHOLD_MIN;
-  const float max = SIGMOID_THRESHOLD_MAX;
-  for (int k = 0; k < d; ++k) {
-    // C_t = C_t-1 * fgated + cand_gated * igated
-    ct[k] = ct_1[k] * f[k] + gates[k] * i[k];
-    // H_t = act_cell(C_t) * ogated
-    float tmp = ct[k] * 2;
-    tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
-    vexp_1->Compute(&tmp, &tmp, 1);
-    tmp = 2.f / (1.f + tmp) - 1.f;
-    ht[k] = tmp * o[k];
-  }
-}
-
 void lstm_ctht_better(
    const std::shared_ptr<
        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
@@ -389,6 +330,7 @@ void lstm_ctht_better(

 TEST(JitKernel, lstm) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) {
    int d4 = d * 4;
    int d3 = d * 3;
@@ -399,19 +341,17 @@ TEST(JitKernel, lstm) {
    RandomVec<float>(d, ct_1.data(), -2.f, 2.f);
    memcpy(xref.data(), x.data(), sizeof(float) * d4);
    std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
+    const jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell, false);
    const auto& ker =
        jit::KernelPool::Instance()
-            .template Get<jit::LSTMKernel<float>, const std::string&,
-                          const std::string&, const std::string&>(
-                act_gate, act_cand, act_cell, d, false);
+            .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(
+                attr);
    // below kernels are used to compute refer
    const auto& vsigmoid_3d =
        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(
            d3);
    const auto& vtanh_d =
        jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
-    const auto& vexp_1 =
-        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(1);
    const auto& vmul_d =
        jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
    const auto& vadd_d =
@@ -425,9 +365,17 @@ TEST(JitKernel, lstm) {
    float* ct_ref_data = ct_ref.data();
    float* ht_ref_data = ht_ref.data();
    // compute once to check correctness
-    lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
-                  ct_ref_data, ht_ref_data);
-    ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
+    jit::lstm_t step;
+    step.gates = xref_data;
+    step.ct_1 = ct_1_data;
+    step.ct = ct_ref_data;
+    step.ht = ht_ref_data;
+    refer::LSTMCtHt<float>(&step, &attr);
+
+    step.gates = x_data;
+    step.ct = ct_tgt_data;
+    step.ht = ht_tgt_data;
+    ker->ComputeCtHt(&step, &attr);
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3);
      EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3);
@@ -441,32 +389,21 @@ TEST(JitKernel, lstm) {
    auto tmkle = GetCurrentUS();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
-                    ct_ref_data, ht_ref_data);
+      refer::LSTMCtHt<float>(&step, &attr);
    }
    auto trefe = GetCurrentUS();
    auto ttgts = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
+      ker->ComputeCtHt(&step, &attr);
    }
    auto ttgte = GetCurrentUS();
    VLOG(30) << "Vec size " << d
             << ": refer takes: " << (trefe - trefs) / repeat
             << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
-             << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+             << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
  }
 }

-void vscal_ref(const int n, const float a, const float* x, float* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = a * x[i];
-  }
-}
-void vscal_inp_ref(const int n, const float a, float* x) {
-  for (int i = 0; i < n; ++i) {
-    x[i] = a * x[i];
-  }
-}
 #if defined __AVX__ || defined __AVX2__
 void vscal_intri8(const int n, const float a, const float* x, float* y) {
  __m256 tmp;
@@ -492,6 +429,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) {

 TEST(JitKernel, vscal) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
    std::vector<float> x(d), y(d);
    std::vector<float> zref(d), ztgt(d);
@@ -506,12 +444,12 @@ TEST(JitKernel, vscal) {
    float* zref_data = zref.data();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vscal_ref(d, a, x_data, zref_data);
+      refer::VScal<float>(&a, x_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();
    auto trefs1 = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vscal_inp_ref(d, a, y_data);
+      refer::VScal<float>(&a, y_data, y_data, d);
    }
    auto trefe1 = GetCurrentUS();

@@ -536,7 +474,7 @@ TEST(JitKernel, vscal) {
      }
      auto si3 = GetCurrentUS();
      VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
-               << " us, inplace: " << (si3 - si2) / repeat;
+               << " us, inplace: " << (si3 - si2) / repeat << " us";
    }
 #endif

@@ -560,19 +498,14 @@ TEST(JitKernel, vscal) {
             << " us, "
 #endif
             << "tgt takes: " << (ttgte - ttgts) / repeat
-             << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
+             << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat
+             << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-void vmul_ref(const int n, const float* x, const float* y, float* z) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] * y[i];
-  }
-}
-
 #if defined __AVX__ || defined __AVX2__
 void vmul_intri8(const int n, const float* x, const float* y, float* z) {
  __m256 tmpx, tmpy;
@@ -591,6 +524,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) {

 TEST(JitKernel, vmul) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {7, 8, 15, 16, 20, 30, 256, 512, 1000, 1024}) {
    std::vector<float> x(d), y(d);
    std::vector<float> zref(d), ztgt(d);
@@ -604,7 +538,7 @@ TEST(JitKernel, vmul) {
    float* zref_data = zref.data();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vmul_ref(d, x_data, y_data, zref_data);
+      refer::VMul<float>(x_data, y_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();

@@ -640,19 +574,13 @@ TEST(JitKernel, vmul) {
 #else
             << " us, "
 #endif
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-void vadd_ref(const int n, const float* x, const float* y, float* z) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-  }
-}
-
 #if defined __AVX__ || defined __AVX2__
 void vadd_intri8(const int n, const float* x, const float* y, float* z) {
  __m256 tmpx, tmpy;
@@ -671,6 +599,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) {

 TEST(JitKernel, vadd) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
    std::vector<float> x(d), y(d);
    std::vector<float> zref(d), ztgt(d);
@@ -684,7 +613,7 @@ TEST(JitKernel, vadd) {
    float* zref_data = zref.data();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vadd_ref(d, x_data, y_data, zref_data);
+      refer::VAdd<float>(x_data, y_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();

@@ -720,19 +649,13 @@ TEST(JitKernel, vadd) {
 #else
             << " us, "
 #endif
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
 }

-void vaddrelu_ref(const int n, const float* x, const float* y, float* z) {
-  for (int i = 0; i < n; ++i) {
-    z[i] = x[i] + y[i];
-    z[i] = z[i] > 0 ? z[i] : 0;
-  }
-}
 void vaddrelu_better(
    const std::shared_ptr<
        const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
@@ -745,6 +668,7 @@ void vaddrelu_better(

 TEST(JitKernel, vaddrelu) {
  namespace jit = paddle::operators::math::jitkernel;
+  namespace refer = paddle::operators::math::jitkernel::refer;
  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
    std::vector<float> x(d), y(d);
    std::vector<float> zref(d), ztgt(d);
@@ -762,7 +686,7 @@ TEST(JitKernel, vaddrelu) {
    float* zref_data = zref.data();
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      vaddrelu_ref(d, x_data, y_data, zref_data);
+      refer::VAddRelu<float>(x_data, y_data, zref_data, d);
    }
    auto trefe = GetCurrentUS();
    auto tmkls = GetCurrentUS();
@@ -778,7 +702,7 @@ TEST(JitKernel, vaddrelu) {
    VLOG(30) << "Vec size " << d
             << ": refer takes: " << (trefe - trefs) / repeat
             << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
-             << "tgt takes: " << (ttgte - ttgts) / repeat;
+             << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
@@ -789,21 +713,23 @@ TEST(JitKernel, pool) {
  namespace jit = paddle::operators::math::jitkernel;
  const int frame_size = 4;
  std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
+  jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);
+
+  // Dummy call (result unused): avoids unknown flag 'use_pinned_memory' on Mac
+  paddle::platform::jit::MayIUse(paddle::platform::jit::avx);
  const auto& plstm1 =
      jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const std::string&,
-                        const std::string&, const std::string&>(
-              act_gate, act_cand, act_cell, frame_size, false);
+          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
+
  const auto& plstm2 =
      jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const std::string&,
-                        const std::string&, const std::string&>(
-              act_gate, act_cand, act_cell, frame_size, false);
+          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
+  EXPECT_EQ(plstm1, plstm2);
+
  const auto& peephole =
      jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const std::string&,
-                        const std::string&, const std::string&>(
-              act_gate, act_cand, act_cell, frame_size, true);
+          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(
+              jit::lstm_attr_t(frame_size, act_gate, act_cand, act_cell, true));
  EXPECT_TRUE(plstm1 != peephole);

  const auto& pvmul_f =

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -671,6 +671,55 @@ EOF
    ${DOCKERFILE_CUBLAS_DSO}
    ${DOCKERFILE_GPU_ENV}
    ENV NCCL_LAUNCH_MODE PARALLEL
+EOF
+    elif [ "$1" == "cp36-cp36m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+        xz-utils tk-dev libffi-dev liblzma-dev
+    RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
+        tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
+        ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
+        wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
+        tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
+        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+        make -j8 > /dev/null && make altinstall > /dev/null
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && \
+        pip3.6 install opencv-python && pip3.6 install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
+EOF
+    elif [ "$1" == "cp37-cp37m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+        xz-utils tk-dev libffi-dev liblzma-dev
+    RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
+        tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
+        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+        make -j8 > /dev/null && make altinstall > /dev/null
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && \
+        pip3.7 install opencv-python && pip3.7 install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
 EOF
    else
        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6022,9 +6022,10 @@ def image_resize(input,
        raise ValueError(
            "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
        )
+    resample_type = resample_methods[resample]
    if out_shape is None and scale is None:
        raise ValueError("One of out_shape and scale must not be None.")
-    helper = LayerHelper('interpolate', **locals())
+    helper = LayerHelper('{}_interp'.format(resample_type), **locals())
    dtype = helper.input_dtype()

    def _is_list_or_turple_(data):
@@ -6058,18 +6059,16 @@ def image_resize(input,

    out = helper.create_variable_for_type_inference(dtype)
    helper.append_op(
-        type='interpolate',
+        type='{}_interp'.format(resample_type),
        inputs=inputs,
        outputs={"Out": out},
-        attrs={
-            "out_h": out_h,
+        attrs={"out_h": out_h,
               "out_w": out_w,
-            "interp_method": resample_methods[resample]
-        })
+               "interp_method": resample_type})
    return out


-@templatedoc(op_type="interpolate")
+@templatedoc(op_type="bilinear_interp")
 def resize_bilinear(input,
                    out_shape=None,
                    scale=None,
@@ -6125,7 +6124,7 @@ def resize_bilinear(input,
    return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)


-@templatedoc(op_type="interpolate")
+@templatedoc(op_type="nearest_interp")
 def resize_nearest(input,
                   out_shape=None,
                   scale=None,
@@ -6990,6 +6989,13 @@ def elu(x, alpha=1.0, name=None):

    Returns:
        output(${out_type}): ${out_comment}
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
+            y = fluid.layers.elu(x, alpha=0.2)
    """
    helper = LayerHelper('elu', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7013,6 +7019,13 @@ def relu6(x, threshold=6.0, name=None):

    Returns:
        output(${out_type}): ${out_comment}
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
+            y = fluid.layers.relu6(x, threshold=6.0)
    """
    helper = LayerHelper('relu6', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7036,6 +7049,13 @@ def pow(x, factor=1.0, name=None):

    Returns:
        output(${out_type}): ${out_comment}
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
+            y = fluid.layers.pow(x, factor=2.0)
    """
    helper = LayerHelper('pow', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7060,6 +7080,13 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None):

    Returns:
        output(${out_type}): ${out_comment}
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
+            y = fluid.layers.stanh(x, scale_a=0.67, scale_b=1.72)
    """
    helper = LayerHelper('stanh', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7085,6 +7112,13 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):

    Returns:
        output(${out_type}): ${out_comment}
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
+            y = fluid.layers.hard_sigmoid(x, slope=0.3, offset=0.8)
    """
    helper = LayerHelper('hard_sigmoid', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7109,6 +7143,13 @@ def swish(x, beta=1.0, name=None):

    Returns:
        output(${out_type}): ${out_comment}
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
+            y = fluid.layers.swish(x, beta=2.0)
    """
    helper = LayerHelper('swish', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -81,25 +81,27 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
-list(REMOVE_ITEM TEST_OPS test_interpolate_op)
+list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
+list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
-py_test_modules(test_interpolate_op MODULES test_interpolate_op SERIAL)
+py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
+py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
 if(WITH_DISTRIBUTE)
    py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
    if(NOT APPLE)
        set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
-        py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
-        set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
-        # FIXME(typhoonzero): add this back
-	#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-	#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
+        # FIXME(typhoonzero): add these tests back
+	# py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
+	# set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
+	# py_test_modules(test_dist_transformer MODULES test_dist_transformer)
+	# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
        # TODO(typhoonzero): make dist test parallel when fix port management issue
-        set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_se_resnext test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
+        set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
    endif(NOT APPLE)
    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()

--- a/python/paddle/fluid/tests/unittests/test_interpolate_op.py
+++ b/python/paddle/fluid/tests/unittests/test_interpolate_op.py
@@ -20,36 +20,6 @@ from op_test import OpTest
 import paddle.fluid.core as core


-def nearest_neighbor_interp_np(X,
-                               out_h,
-                               out_w,
-                               out_size=None,
-                               actual_shape=None):
-    """nearest neighbor interpolation implement in shape [N, C, H, W]"""
-    if out_size is not None:
-        out_h = out_size[0]
-        out_w = out_size[1]
-    if actual_shape is not None:
-        out_h = actual_shape[0]
-        out_w = actual_shape[1]
-    n, c, in_h, in_w = X.shape
-
-    ratio_h = ratio_w = 0.0
-    if out_h > 1:
-        ratio_h = (in_h - 1.0) / (out_h - 1.0)
-    if out_w > 1:
-        ratio_w = (in_w - 1.0) / (out_w - 1.0)
-
-    out = np.zeros((n, c, out_h, out_w))
-    for i in range(out_h):
-        in_i = int(ratio_h * i + 0.5)
-        for j in range(out_w):
-            in_j = int(ratio_w * j + 0.5)
-            out[:, :, i, j] = X[:, :, in_i, in_j]
-
-    return out.astype(X.dtype)
-
-
 def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
    """bilinear interpolation implement in shape [N, C, H, W]"""
    if out_size is not None:
@@ -87,22 +57,16 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
    return out.astype(input.dtype)


-INTERPOLATE_FUNCS = {
-    'bilinear': bilinear_interp_np,
-    'nearest': nearest_neighbor_interp_np,
-}
-
-
-class TestInterpolateOp(OpTest):
+class TestBilinearInterpOp(OpTest):
    def setUp(self):
        self.out_size = None
        self.actual_shape = None
        self.init_test_case()
-        self.op_type = "interpolate"
+        self.op_type = "bilinear_interp"
        input_np = np.random.random(self.input_shape).astype("float32")

-        output_np = INTERPOLATE_FUNCS[self.interp_method](
-            input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size, self.actual_shape)
        self.inputs = {'X': input_np}
        if self.out_size is not None:
            self.inputs['OutSize'] = self.out_size
@@ -129,7 +93,7 @@ class TestInterpolateOp(OpTest):
        self.out_size = np.array([3, 3]).astype("int32")


-class TestBilinearInterpCase1(TestInterpolateOp):
+class TestBilinearInterpCase1(TestBilinearInterpOp):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [4, 1, 7, 8]
@@ -137,7 +101,7 @@ class TestBilinearInterpCase1(TestInterpolateOp):
        self.out_w = 1


-class TestBilinearInterpCase2(TestInterpolateOp):
+class TestBilinearInterpCase2(TestBilinearInterpOp):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [3, 3, 9, 6]
@@ -145,7 +109,7 @@ class TestBilinearInterpCase2(TestInterpolateOp):
        self.out_w = 12


-class TestBilinearInterpCase3(TestInterpolateOp):
+class TestBilinearInterpCase3(TestBilinearInterpOp):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [1, 1, 128, 64]
@@ -153,7 +117,7 @@ class TestBilinearInterpCase3(TestInterpolateOp):
        self.out_w = 128


-class TestBilinearInterpCase4(TestInterpolateOp):
+class TestBilinearInterpCase4(TestBilinearInterpOp):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [4, 1, 7, 8]
@@ -162,7 +126,7 @@ class TestBilinearInterpCase4(TestInterpolateOp):
        self.out_size = np.array([2, 2]).astype("int32")


-class TestBilinearInterpCase5(TestInterpolateOp):
+class TestBilinearInterpCase5(TestBilinearInterpOp):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [3, 3, 9, 6]
@@ -171,7 +135,7 @@ class TestBilinearInterpCase5(TestInterpolateOp):
        self.out_size = np.array([11, 11]).astype("int32")


-class TestBilinearInterpCase6(TestInterpolateOp):
+class TestBilinearInterpCase6(TestBilinearInterpOp):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [1, 1, 128, 64]
@@ -180,7 +144,7 @@ class TestBilinearInterpCase6(TestInterpolateOp):
        self.out_size = np.array([65, 129]).astype("int32")


-class TestBilinearInterpActualShape(TestInterpolateOp):
+class TestBilinearInterpActualShape(TestBilinearInterpOp):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [3, 2, 32, 16]
@@ -189,25 +153,16 @@ class TestBilinearInterpActualShape(TestInterpolateOp):
        self.out_size = np.array([66, 40]).astype("int32")


-class TestBilinearInterpBigScale(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [4, 4, 64, 32]
-        self.out_h = 100
-        self.out_w = 50
-        self.out_size = np.array([101, 51]).astype('int32')
-
-
-class TestInterpolateOpUint8(OpTest):
+class TestBilinearInterpOpUint8(OpTest):
    def setUp(self):
        self.out_size = None
        self.actual_shape = None
        self.init_test_case()
-        self.op_type = "interpolate"
+        self.op_type = "bilinear_interp"
        input_np = np.random.randint(
            low=0, high=256, size=self.input_shape).astype("uint8")
-        output_np = INTERPOLATE_FUNCS[self.interp_method](
-            input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size, self.actual_shape)
        self.inputs = {'X': input_np}
        if self.out_size is not None:
            self.inputs['OutSize'] = self.out_size
@@ -228,7 +183,7 @@ class TestInterpolateOpUint8(OpTest):
        self.out_w = 9


-class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
+class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [2, 3, 128, 64]
@@ -236,7 +191,7 @@ class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
        self.out_w = 50


-class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
+class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
    def init_test_case(self):
        self.interp_method = 'bilinear'
        self.input_shape = [4, 1, 7, 8]
@@ -245,91 +200,5 @@ class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
        self.out_size = np.array([6, 15]).astype("int32")


-class TestNearestNeighborInterpCase1(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-
-
-class TestNearestNeighborInterpCase2(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-
-
-class TestNearestNeighborInterpCase3(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-
-
-class TestNearestNeighborInterpCase4(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.out_size = np.array([2, 2]).astype("int32")
-
-
-class TestNearestNeighborInterpCase5(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.out_size = np.array([11, 11]).astype("int32")
-
-
-class TestNearestNeighborInterpCase6(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 128, 64]
-        self.out_h = 64
-        self.out_w = 128
-        self.out_size = np.array([65, 129]).astype("int32")
-
-
-class TestNearestNeighborInterpActualShape(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.out_size = np.array([66, 40]).astype("int32")
-
-
-class TestNearestNeighborInterpBigScale(TestInterpolateOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 4, 64, 32]
-        self.out_h = 100
-        self.out_w = 50
-        self.out_size = np.array([101, 51]).astype('int32')
-
-
-class TestNearestNeighborInterpCase1Uint8(TestInterpolateOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 120
-        self.out_w = 50
-
-
-class TestNearestNeighborInterpCase2Uint8(TestInterpolateOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 5
-        self.out_w = 13
-        self.out_size = np.array([6, 15]).astype("int32")
-
-
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+def nearest_neighbor_interp_np(X,
+                               out_h,
+                               out_w,
+                               out_size=None,
+                               actual_shape=None):
+    """NumPy reference for nearest-neighbor interpolation of [N, C, H, W] data.
+
+    Args:
+        X: input array; its shape is unpacked as (n, c, in_h, in_w).
+        out_h: output height, used unless overridden by the arguments below.
+        out_w: output width, used unless overridden by the arguments below.
+        out_size: optional (height, width) pair that replaces out_h/out_w.
+        actual_shape: optional (height, width) pair; it is applied after
+            out_size, so it takes precedence when both are given.
+
+    Returns:
+        An array of shape (n, c, out_h, out_w) cast to X.dtype.
+    """
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
+    if actual_shape is not None:
+        out_h = actual_shape[0]
+        out_w = actual_shape[1]
+    n, c, in_h, in_w = X.shape
+
+    # The ratios map output index 0 to input index 0 and the last output
+    # index to the last input index. A size-1 output axis keeps ratio 0, so
+    # its single row/column always reads input index 0.
+    ratio_h = ratio_w = 0.0
+    if out_h > 1:
+        ratio_h = (in_h - 1.0) / (out_h - 1.0)
+    if out_w > 1:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+
+    # Scratch buffer in NumPy's default float dtype; converted on return.
+    out = np.zeros((n, c, out_h, out_w))
+    for i in range(out_h):
+        # Adding 0.5 before truncating rounds the (non-negative) source
+        # coordinate to the nearest integer index.
+        in_i = int(ratio_h * i + 0.5)
+        for j in range(out_w):
+            in_j = int(ratio_w * j + 0.5)
+            # Copy the chosen source pixel for every batch and channel.
+            out[:, :, i, j] = X[:, :, in_i, in_j]
+
+    return out.astype(X.dtype)
+
+
+class TestNearestInterpOp(OpTest):
+    """Base float32 test case for the "nearest_interp" operator.
+
+    Subclasses override init_test_case() to supply the input shape, the
+    out_h/out_w attributes and, optionally, out_size / actual_shape.
+    """
+
+    def setUp(self):
+        # Optional settings default to None; init_test_case() may set them.
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp"
+        input_np = np.random.random(self.input_shape).astype("float32")
+
+        # Expected output comes from the NumPy reference implementation.
+        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
+                                               self.out_size, self.actual_shape)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        # actual_shape is written to the same 'OutSize' key, so it replaces
+        # out_size when both are set (matching the reference's precedence).
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method
+        }
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        # Delegates to OpTest.check_output(); presumably compares the op's
+        # result against self.outputs -- confirm against OpTest.
+        self.check_output()
+
+    def test_check_grad(self):
+        # Delegates to OpTest.check_grad() for input 'X' and output 'Out'.
+        # NOTE(review): exact meaning of in_place=True is defined by OpTest.
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        # Default case: 4x4 input, attributes request 2x2, OutSize is 3x3.
+        # The reference output is computed at the OutSize dimensions.
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 4, 4]
+        self.out_h = 2
+        self.out_w = 2
+        self.out_size = np.array([3, 3]).astype("int32")
+
+
+class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+    # Reduces a 7x8 input to a single 1x1 output using attributes only
+    # (no OutSize input is supplied).
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+
+
+class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+    # Enlarges a 9x6 input to 12x12 using attributes only.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+
+
+class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+    # 128x64 input to 64x128 output: height shrinks while width grows.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+
+
+class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+    # Attributes request 1x1 but OutSize is 2x2; the reference output is
+    # computed at the OutSize dimensions.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+    # Attributes request 12x12 but OutSize is 11x11; the reference output
+    # is computed at the OutSize dimensions.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+    # Attributes request 64x128 but OutSize is 65x129; the reference output
+    # is computed at the OutSize dimensions.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
+
+
+class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+    # 32x16 input, attributes request 64x32, OutSize is 66x40.
+    # NOTE(review): despite the class name, this sets out_size and leaves
+    # actual_shape as None, so the actual_shape branch of setUp() is not
+    # exercised here -- confirm whether that is intended.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.out_size = np.array([66, 40]).astype("int32")
+
+
+class TestNearestInterpOpUint8(OpTest):
+    """Base uint8 test case for the "nearest_interp" operator.
+
+    Only the forward output is checked (there is no gradient test), and the
+    check runs on core.CPUPlace() with atol=1.
+    """
+
+    def setUp(self):
+        # Optional settings default to None; init_test_case() may set them.
+        self.out_size = None
+        self.actual_shape = None
+        self.init_test_case()
+        self.op_type = "nearest_interp"
+        # high=256 is exclusive, so the values span the full uint8 range.
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
+                                               self.out_size, self.actual_shape)
+        self.inputs = {'X': input_np}
+        # NOTE(review): actual_shape is passed to the reference above but is
+        # never added to self.inputs, unlike TestNearestInterpOp.setUp. No
+        # visible subclass sets it, so this is currently harmless.
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method
+        }
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        # Output check restricted to the CPU place, with atol=1.
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        # Default case: 9x6 input to 10x9 output, attributes only.
+        self.interp_method = 'nearest'
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+
+
+class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
+    # uint8 variant: shrinks a 128x64 input to 120x50 using attributes only.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [2, 3, 128, 64]
+        self.out_h = 120
+        self.out_w = 50
+
+
+class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
+    # uint8 variant: attributes request 5x13 but OutSize is 6x15; the
+    # reference output is computed at the OutSize dimensions.
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.out_size = np.array([6, 15]).astype("int32")
+
+
+# Run this module's test cases when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()