Merge pull request #16104 from tensor-tang/refine/jit

refine jitkernels and test

Merge pull request #16104 from tensor-tang/refine/jit
refine jitkernels and test
ccc7c358 · Tao Luo · GitHub · c49b7855 · 14d87112 · ccc7c358
45 changed file
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    Tensor track;
    int* track_value =
        track.mutable_data<int>(emission_dims, platform::CPUPlace());
-    auto ker = jit::Get<jit::kCRFDecoding, jit::CRFDecodingTuples<T>,
+    auto ker =
-                        platform::CPUPlace>(tag_num);
+        jit::KernelFuncs<jit::CRFDecodingTuple<T>, platform::CPUPlace>::Cache()
+            .At(tag_num);
    ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
    T max_score = -std::numeric_limits<T>::max();
    int max_i = 0;

--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
        constexpr int simd_width = 16;
        int C = c / simd_width;
-        auto multiply = jit::Get<jit::kNCHW16CMulNC, jit::NCHW16CMulNCTuples<T>,
+        auto multiply = jit::KernelFuncs<jit::NCHW16CMulNCTuple<T>,
-                                 platform::CPUPlace>(0);
+                                         platform::CPUPlace>::Cache()
+                            .At(0);
 #pragma omp parallel for collapse(2)
        for (int ni = 0; ni < n; ni++) {
          for (int ci = 0; ci < C; ci++) {

--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor {
                                  out_width, jit::SeqPoolType::kSum);
    for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
      attr.index_height = ids_lod[i + 1] - ids_lod[i];
-      auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
+      auto emb_seqpool =
-                                  platform::CPUPlace>(attr);
+          jit::KernelFuncs<jit::EmbSeqPoolTuple<T>, platform::CPUPlace>::Cache()
+              .At(attr);
      emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
                  &attr);
    }
@@ -135,8 +136,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
      T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
      const T *d_output_data = d_output->data<T>();
-      auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
+      auto vbroadcast =
-                                 platform::CPUPlace>(out_width);
+          jit::KernelFuncs<jit::VBroadcastTuple<T>, platform::CPUPlace>::Cache()
+              .At(out_width);
      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
        int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
        const T *src = d_output_data + i * out_width;

--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -196,11 +196,14 @@ class FusionGRUKernel : public framework::OpKernel<T> {
      jit::to_kerneltype(ctx.Attr<std::string>("activation")));              \
  jit::gru_t one_step;                                                       \
  auto ComputeH1 =                                                           \
-      jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr);      \
+      jit::KernelFuncs<jit::GRUH1Tuple<T>, platform::CPUPlace>::Cache().At(  \
+          attr);                                                             \
  auto ComputeHtPart1 =                                                      \
-      jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
+      jit::KernelFuncs<jit::GRUHtPart1Tuple<T>, platform::CPUPlace>::Cache() \
+          .At(attr);                                                         \
  auto ComputeHtPart2 =                                                      \
-      jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
+      jit::KernelFuncs<jit::GRUHtPart2Tuple<T>, platform::CPUPlace>::Cache() \
+          .At(attr);                                                         \
  const T* x_data = x->data<T>();                                            \
  const T* wx_data = wx->data<T>();                                          \
  const T* wh_data = wh->data<T>();                                          \

--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -258,9 +258,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  one_step.wp = wp_data;                                                       \
  one_step.checked = checked_cell_data;                                        \
  auto ComputeC1H1 =                                                           \
-      jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \
+      jit::KernelFuncs<jit::LSTMC1H1Tuple<T>, platform::CPUPlace>::Cache().At( \
+          attr);                                                               \
  auto ComputeCtHt =                                                           \
-      jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr)
+      jit::KernelFuncs<jit::LSTMCtHtTuple<T>, platform::CPUPlace>::Cache().At( \
+          attr)
 // Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \

--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -82,9 +82,11 @@ template <typename T>
 static void fc_relu(const T* x, const T* w, const T* b, T* y,
                    const jit::matmul_attr_t& attr) {
  auto matmul =
-      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
+      jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
+          attr);
  auto addbias_relu =
-      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
+      jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
+          attr.n);
  matmul(x, w, y, &attr);
  T* dst = y;
  for (int i = 0; i < attr.m; ++i) {

--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
@@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
      attr.type = jit::SeqPoolType::kSqrt;
    }
    auto seqpool =
-        jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
+        jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
            attr);
    size_t n = ins.size();
    size_t dst_step_size = n * w;

--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
    int o_numel = attr.m * attr.n;
    auto vsquare_x =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
-                                                                       attr.k);
+            attr.m * attr.k);
    auto vsquare_y =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
-                                                                       attr.n);
+            attr.k * attr.n);
    auto vsquare_xy =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
    auto vsub =
-        jit::Get<jit::kVSub, jit::XYZNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VSubTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
    auto vscal =
-        jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VScalTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
    auto matmul =
-        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
+        jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
+            attr);
    const T* x_data = x->data<T>();
    const T* y_data = y->data<T>();

--- a/paddle/fluid/operators/jit/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/CMakeLists.txt
@@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n")
 file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
 file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
 file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
 list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)

--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
      InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_());     \
  void BenchJITKernel_##name##_##dtype##_##place##_::Run()
-#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
 void RUN_ALL_BENCHMARK() {
  for (auto p : g_all_benchmarks) {
    if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
@@ -90,11 +88,11 @@ std::vector<int> TestSizes() {
  return s;
 }
-template <typename KernelTuples, typename... Args>
+template <typename KernelTuple, typename... Args>
 struct BenchFunc {
  // return this function avg time
  // TODO(TJ): clear cache every time
-  double operator()(const typename KernelTuples::func_type tgt, Args... args) {
+  double operator()(const typename KernelTuple::func_type tgt, Args... args) {
    for (int i = 0; i < FLAGS_burning; ++i) {
      tgt(args...);
    }
@@ -109,40 +107,17 @@ struct BenchFunc {
 namespace jit = paddle::operators::jit;
-template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
+template <typename KernelTuple, typename PlaceType, typename... Args>
-          typename... Args>
+void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
-void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
+  BenchFunc<KernelTuple, Args...> benchmark;
-  BenchFunc<KernelTuples, Args...> benchmark;
  std::vector<std::pair<std::string, double>> infos;
-  // test refer
+  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
-  auto refer = jit::GetRefer<KT, KernelTuples>();
+  for (auto f : funcs) {
-  if (!refer) {
+    infos.push_back(std::make_pair(f.first, benchmark(f.second, args...)));
-    LOG(FATAL) << "Refer can not be empty!";
  }
-  infos.push_back(std::make_pair("Refer", benchmark(refer, args...)));
-  // test jitcode
-  auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
-  if (jitcode) {
-    infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...)));
-  }
-  // test all impls in more
-  jit::KernelKey kkey(KT, PlaceType());
-  auto& pool = jit::KernelPool().Instance().AllKernels();
-  auto iter = pool.find(kkey);
-  if (iter != pool.end()) {
-    auto& impls = iter->second;
-    for (auto& impl : impls) {
-      auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
-      if (i && i->UseMe(attr)) {
-        auto more = i->GetFunc();
-        infos.push_back(
-            std::make_pair(i->ImplType(), benchmark(more, args...)));
-      }
-    }
-  }
  // Test result from Get function
-  auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
+  auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(attr);
  if (!tgt) {
    LOG(FATAL) << "Target can not be empty!";
  }
@@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
  // print
  std::ostringstream loginfos;
-  loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": ";
+  loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": "
+           << attr << ": ";
  for (auto pair : infos) {
    loginfos << pair.first << " takes " << pair.second << " us; ";
  }
@@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
 using Tensor = paddle::framework::Tensor;
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchXYZNKernel() {
+void BenchKernelXYZN() {
+  using T = typename KernelTuple::data_type;
  for (int d : TestSizes()) {
    Tensor x, y, z;
    x.Resize({d});
@@ -171,16 +148,16 @@ void BenchXYZNKernel() {
    T* z_data = z.mutable_data<T>(PlaceType());
    RandomVec<T>(d, x_data);
    RandomVec<T>(d, y_data);
-    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y.data<T>(), z_data,
-                                                     y.data<T>(), z_data, d);
+                                          d);
    // test inplace
-    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data,
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), z_data, z_data, d);
-                                                     z_data, d);
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchAXYNKernel() {
+void BenchKernelAXYN() {
+  using T = typename KernelTuple::data_type;
  for (int d : TestSizes()) {
    const T a = static_cast<T>(3);
    Tensor x, y;
@@ -189,26 +166,26 @@ void BenchAXYNKernel() {
    T* x_data = x.mutable_data<T>(PlaceType());
    T* y_data = y.mutable_data<T>(PlaceType());
    RandomVec<T>(d, x_data);
-    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
+    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), y_data, d);
-                                                     d);
    // test inplace
-    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), x_data,
+    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), x_data, d);
-                                                     d);
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchXRNKernel() {
+void BenchKernelXRN() {
+  using T = typename KernelTuple::data_type;
  for (int d : TestSizes()) {
    Tensor x;
    RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
    T res;
-    BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), &res, d);
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchXYNKernel() {
+void BenchKernelXYN() {
+  using T = typename KernelTuple::data_type;
  for (int d : TestSizes()) {
    Tensor x, y;
    x.Resize({d});
@@ -216,12 +193,13 @@ void BenchXYNKernel() {
    T* x_data = x.mutable_data<T>(PlaceType());
    T* y_data = y.mutable_data<T>(PlaceType());
    RandomVec<T>(d, x_data);
-    BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data<T>(), y_data, d);
+    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y_data, d);
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchLSTMKernel() {
+void BenchKernelLSTM() {
+  using T = typename KernelTuple::data_type;
  for (bool use_peephole : {true, false}) {
    for (int d : TestSizes()) {
      const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
@@ -252,13 +230,14 @@ void BenchLSTMKernel() {
        step.wp = wp_data;
        step.checked = checked_data;
      }
-      BenchAllImpls<KT, jit::LSTMTuples<T>, PlaceType>(attr, &step, &attr);
+      BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchGRUKernel() {
+void BenchKernelGRU() {
+  using T = typename KernelTuple::data_type;
  for (int d : TestSizes()) {
    const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
    auto place = PlaceType();
@@ -275,12 +254,13 @@ void BenchGRUKernel() {
    step.gates = x_data;
    step.ht_1 = ht_1_data;
    step.ht = ht_data;
-    BenchAllImpls<KT, jit::GRUTuples<T>, PlaceType>(attr, &step, &attr);
+    BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchSeqPoolKernel() {
+void BenchKernelSeqPool() {
+  using T = typename KernelTuple::data_type;
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
  for (auto type : pool_types) {
@@ -294,15 +274,15 @@ void BenchSeqPoolKernel() {
        RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
        const T* x_data = x.data<T>();
        T* y_data = y.mutable_data<T>(PlaceType());
-        BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
+        BenchAllImpls<KernelTuple, PlaceType>(attr, x_data, y_data, &attr);
-                                                            y_data, &attr);
      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchEmbSeqPoolKernel() {
+void BenchKernelEmbSeqPool() {
+  using T = typename KernelTuple::data_type;
  std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
  int64_t tbl_h = 1e4;
  for (int tbl_w : {10, 16, 256}) {
@@ -324,16 +304,17 @@ void BenchEmbSeqPoolKernel() {
                             tbl_h - 1);
          const int64_t* idx_data = idx.data<int64_t>();
          T* o_data = out.mutable_data<T>(PlaceType());
-          BenchAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType>(
+          BenchAllImpls<KernelTuple, PlaceType>(attr, table_data, idx_data,
-              attr, table_data, idx_data, o_data, &attr);
+                                                o_data, &attr);
        }
      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchSgdKernel() {
+void BenchKernelSgd() {
+  using T = typename KernelTuple::data_type;
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
@@ -364,15 +345,16 @@ void BenchSgdKernel() {
        const T* grad_data = grad.data<T>();
        const int64_t* rows_data = rows.data();
        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
-        BenchAllImpls<KT, jit::SgdTuples<T>, PlaceType>(
+        BenchAllImpls<KernelTuple, PlaceType>(attr, &lr, param_data, grad_data,
-            attr, &lr, param_data, grad_data, rows_data, param_data, &attr);
+                                              rows_data, param_data, &attr);
      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchMatMulKernel() {
+void BenchKernelMatMul() {
+  using T = typename KernelTuple::data_type;
  for (int m : {1, 2, 3, 4}) {
    for (int n : TestSizes()) {
      for (int k : TestSizes()) {
@@ -386,15 +368,16 @@ void BenchMatMulKernel() {
        const T* b_data = b.data<T>();
        T* c_data = c.mutable_data<T>(PlaceType());
        const jit::matmul_attr_t attr{m, n, k};
-        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
+        BenchAllImpls<KernelTuple, PlaceType>(attr, a_data, b_data, c_data,
-                                                           c_data, &attr);
+                                              &attr);
      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchSoftmaxKernel() {
+void BenchKernelSoftmax() {
+  using T = typename KernelTuple::data_type;
  for (int bs : {1, 2, 10}) {
    for (int n : TestSizes()) {
      Tensor x, y;
@@ -403,14 +386,14 @@ void BenchSoftmaxKernel() {
      RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
      const T* x_data = x.data<T>();
      T* y_data = y.mutable_data<T>(PlaceType());
-      BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n,
+      BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs);
-                                                          bs);
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchLayerNormKernel() {
+void BenchKernelLayerNorm() {
+  using T = typename KernelTuple::data_type;
  const T epsilon = 9.99999975e-06;
  for (int n : {1, 2, 10}) {
    for (int x_dim_0 : {1, 9, 17, 50}) {
@@ -439,16 +422,17 @@ void BenchLayerNormKernel() {
        T* var_data = var.data<T>();
        T* out_data = out.mutable_data<T>(PlaceType());
-        BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
+        BenchAllImpls<KernelTuple, PlaceType>(right, x_data, out_data,
-            right, x_data, out_data, mean_data, var_data, scale_data, bias_data,
+                                              mean_data, var_data, scale_data,
-            left, epsilon, right);
+                                              bias_data, left, epsilon, right);
      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchCRFDecodingKernel() {
+void BenchKernelCRFDecoding() {
+  using T = typename KernelTuple::data_type;
  constexpr int state_trans_base_idx = 2;
  for (int seq_len : {1, 11, 17, 50}) {
    for (int tag_num : TestSizes()) {
@@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() {
      T* alpha_data = alpha.mutable_data<T>(PlaceType());
      int* track_data = track.mutable_data<int>(PlaceType());
-      BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
+      BenchAllImpls<KernelTuple, PlaceType>(tag_num, seq_len, x_data, w_data,
-          tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
+                                            alpha_data, track_data, tag_num);
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void BenchVBroadcastKernel() {
+void BenchKernelVBroadcast() {
+  using T = typename KernelTuple::data_type;
  for (int64_t w : {1, 16, 64, 100, 256}) {
    Tensor x;
    x.Resize({w});
@@ -485,78 +470,86 @@ void BenchVBroadcastKernel() {
      Tensor y;
      y.Resize({h * w});
      T* y_data = y.mutable_data<T>(PlaceType());
-      BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
+      BenchAllImpls<KernelTuple, PlaceType>(w, x_data, y_data,
-          w, x_data, y_data, static_cast<int64_t>(h), w);
+                                            static_cast<int64_t>(h), w);
    }
  }
 }
-using T = float;
+#define BenchKernelVMul BenchKernelXYZN
-using CPUPlace = paddle::platform::CPUPlace;
+#define BenchKernelVAdd BenchKernelXYZN
+#define BenchKernelVAddRelu BenchKernelXYZN
+#define BenchKernelVSub BenchKernelXYZN
-// xyzn
+#define BenchKernelVScal BenchKernelAXYN
-BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
+#define BenchKernelVAddBias BenchKernelAXYN
-BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
-// axyn
+#define BenchKernelVRelu BenchKernelXYN
-BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
+#define BenchKernelVIdentity BenchKernelXYN
-BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
+#define BenchKernelVSquare BenchKernelXYN
+#define BenchKernelVExp BenchKernelXYN
+#define BenchKernelVSigmoid BenchKernelXYN
+#define BenchKernelVTanh BenchKernelXYN
+#define BenchKernelVCopy BenchKernelXYN
-// xrn
+#define BenchKernelHMax BenchKernelXRN
-BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
+#define BenchKernelHSum BenchKernelXRN
-BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
-// xyn
+#define BenchKernelLSTMCtHt BenchKernelLSTM
-BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
+#define BenchKernelLSTMC1H1 BenchKernelLSTM
-BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }
-// lstm and peephole
-BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
-BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
-// gru functions
-BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
-BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
-BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
-// seq pool function
-BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
-// embedding seq pool function
-BENCH_FP32_CPU(kEmbSeqPool) {
-  BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
-}
-// sgd function
+#define BenchKernelGRUH1 BenchKernelGRU
-BENCH_FP32_CPU(kSgd) { BenchSgdKernel<jit::kSgd, T, CPUPlace>(); }
+#define BenchKernelGRUHtPart1 BenchKernelGRU
+#define BenchKernelGRUHtPart2 BenchKernelGRU
-// matmul
+using CPUPlace = paddle::platform::CPUPlace;
-BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
-// softmax
+#define BENCH_FP32_CPU(name)                                \
-BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
+  BENCH_JITKERNEL(name, FP32, CPU) {                        \
+    BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
+  }
-// layernorm
+// xyzn
-BENCH_FP32_CPU(kLayerNorm) {
+BENCH_FP32_CPU(VMul);
-  BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
+BENCH_FP32_CPU(VAdd);
-}
+BENCH_FP32_CPU(VAddRelu);
+BENCH_FP32_CPU(VSub);
-// crfdecoding
+// axyn
-BENCH_FP32_CPU(kCRFDecoding) {
+BENCH_FP32_CPU(VScal);
-  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
+BENCH_FP32_CPU(VAddBias);
-}
-// vbroadcast function
+// xyn
-BENCH_FP32_CPU(kVBroadcast) {
+BENCH_FP32_CPU(VRelu);
-  BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
+BENCH_FP32_CPU(VIdentity);
-}
+BENCH_FP32_CPU(VSquare);
+BENCH_FP32_CPU(VExp);
+BENCH_FP32_CPU(VSigmoid);
+BENCH_FP32_CPU(VTanh);
+BENCH_FP32_CPU(VCopy);
+// xrn
+BENCH_FP32_CPU(HMax);
+BENCH_FP32_CPU(HSum);
+// LSTM
+BENCH_FP32_CPU(LSTMCtHt);
+BENCH_FP32_CPU(LSTMC1H1);
+// GRU
+BENCH_FP32_CPU(GRUH1);
+BENCH_FP32_CPU(GRUHtPart1);
+BENCH_FP32_CPU(GRUHtPart2);
+BENCH_FP32_CPU(LayerNorm);
+BENCH_FP32_CPU(CRFDecoding);
+BENCH_FP32_CPU(SeqPool);
+BENCH_FP32_CPU(EmbSeqPool);
+BENCH_FP32_CPU(MatMul);
+BENCH_FP32_CPU(Softmax);
+BENCH_FP32_CPU(Sgd);
+BENCH_FP32_CPU(VBroadcast);
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]

--- a/paddle/fluid/operators/jit/gen/act.cc
+++ b/paddle/fluid/operators/jit/gen/act.cc
@@ -13,6 +13,7 @@
 * limitations under the License. */
 #include "paddle/fluid/operators/jit/gen/act.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -81,7 +82,7 @@ void VActJitCode::genCode() {
 #define DECLARE_ACT_CREATOR(name)                                            \
  class name##Creator : public JitCodeCreator<int> {                         \
   public:                                                                   \
-    bool UseMe(const int& attr) const override;                              \
+    bool CanBeUsed(const int& attr) const override;                          \
    size_t CodeSize(const int& d) const override;                            \
    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
@@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid);
 DECLARE_ACT_CREATOR(VTanh);
 // TODO(TJ): tuning use me
-bool VReluCreator::UseMe(const int& d) const {
+bool VReluCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }
-bool VSquareCreator::UseMe(const int& d) const {
+bool VSquareCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }
-bool VIdentityCreator::UseMe(const int& d) const {
+bool VIdentityCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }
-bool VExpCreator::UseMe(const int& d) const {
+bool VExpCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx) && d < 32;
 }
-bool VSigmoidCreator::UseMe(const int& d) const {
+bool VSigmoidCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }
-bool VTanhCreator::UseMe(const int& d) const {
+bool VTanhCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }

--- a/paddle/fluid/operators/jit/gen/blas.cc
+++ b/paddle/fluid/operators/jit/gen/blas.cc
@@ -13,6 +13,7 @@
 * limitations under the License. */
 #include "paddle/fluid/operators/jit/gen/blas.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() {
 class NCHW16CMulNCCreator : public JitCodeCreator<int> {
 public:
-  bool UseMe(const int& attr) const override {
+  bool CanBeUsed(const int& attr) const override {
    return platform::MayIUse(platform::avx512f);
  }
  size_t CodeSize(const int& d) const override { return 256 * 1024; }
@@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
 #define DECLARE_BLAS_CREATOR(name)                                           \
  class name##Creator : public JitCodeCreator<int> {                         \
   public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
+    bool CanBeUsed(const int& attr) const override {                         \
      return platform::MayIUse(platform::avx) && attr <= 1024;               \
    }                                                                        \
    size_t CodeSize(const int& d) const override {                           \

--- a/paddle/fluid/operators/jit/gen/embseqpool.cc
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/operators/jit/gen/embseqpool.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
@@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() {
 class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
 public:
-  bool UseMe(const emb_seq_pool_attr_t& attr) const override {
+  bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx) &&
           attr.table_width % YMM_FLOAT_BLOCK == 0;
  }

--- a/paddle/fluid/operators/jit/gen/gru.cc
+++ b/paddle/fluid/operators/jit/gen/gru.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/operators/jit/gen/gru.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -86,7 +87,7 @@ void GRUJitCode::genCode() {
  class name##Creator : public JitCodeCreator<gru_attr_t> {       \
   public:                                                        \
    /* TODO(TJ): enable more */                                   \
-    bool UseMe(const gru_attr_t& attr) const override {           \
+    bool CanBeUsed(const gru_attr_t& attr) const override {       \
      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
    }                                                             \
    size_t CodeSize(const gru_attr_t& attr) const override {      \

--- a/paddle/fluid/operators/jit/gen/hopv.cc
+++ b/paddle/fluid/operators/jit/gen/hopv.cc
@@ -13,6 +13,7 @@
 * limitations under the License. */
 #include "paddle/fluid/operators/jit/gen/hopv.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -76,7 +77,7 @@ void HOPVJitCode::genCode() {
 #define DECLARE_HOP_CREATOR(name)                                            \
  class name##Creator : public JitCodeCreator<int> {                         \
   public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
+    bool CanBeUsed(const int& attr) const override {                         \
      return platform::MayIUse(platform::avx);                               \
    }                                                                        \
    size_t CodeSize(const int& d) const override {                           \

--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
  virtual void genCode() = 0;
  size_t getSize() const override { return CodeGenerator::getSize(); }
-  const unsigned char* getCodeInternal() override {
+  const unsigned char* getCodeInternal() const override {
    const Xbyak::uint8* code = CodeGenerator::getCode();
    return code;
  }

--- a/paddle/fluid/operators/jit/gen/lstm.cc
+++ b/paddle/fluid/operators/jit/gen/lstm.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/operators/jit/gen/lstm.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -114,7 +115,7 @@ void LSTMJitCode::genCode() {
  class name##Creator : public JitCodeCreator<lstm_attr_t> {      \
   public:                                                        \
    /* TODO(TJ): enable more */                                   \
-    bool UseMe(const lstm_attr_t& attr) const override {          \
+    bool CanBeUsed(const lstm_attr_t& attr) const override {      \
      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
    }                                                             \
    size_t CodeSize(const lstm_attr_t& attr) const override {     \

--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@@ -14,8 +14,8 @@
 #include "paddle/fluid/operators/jit/gen/matmul.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -98,7 +98,7 @@ void MatMulJitCode::genCode() {
 class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
 public:
-  bool UseMe(const matmul_attr_t& attr) const override {
+  bool CanBeUsed(const matmul_attr_t& attr) const override {
    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
           attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
  }

--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
@@ -13,6 +13,7 @@
 * limitations under the License. */
 #include "paddle/fluid/operators/jit/gen/seqpool.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() {
 class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
 public:
-  bool UseMe(const seq_pool_attr_t& attr) const override {
+  bool CanBeUsed(const seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx);
  }
  size_t CodeSize(const seq_pool_attr_t& attr) const override {

--- a/paddle/fluid/operators/jit/gen/sgd.cc
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/operators/jit/gen/sgd.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -104,7 +105,7 @@ void SgdJitCode::genCode() {
 class SgdCreator : public JitCodeCreator<sgd_attr_t> {
 public:
-  bool UseMe(const sgd_attr_t& attr) const override {
+  bool CanBeUsed(const sgd_attr_t& attr) const override {
    return platform::MayIUse(platform::avx) &&
           attr.grad_width % YMM_FLOAT_BLOCK == 0;
  }

--- a/paddle/fluid/operators/jit/gen/vbroadcast.cc
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc
@@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() {
 class VBroadcastCreator : public JitCodeCreator<int64_t> {
 public:
-  bool UseMe(const int64_t& w) const override {
+  bool CanBeUsed(const int64_t& w) const override {
    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
  }
  size_t CodeSize(const int64_t& w) const override {

--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -31,7 +31,7 @@ namespace paddle {
 namespace operators {
 namespace jit {
-// refer do not need useme, it would be the last one.
+// refer do not need CanBeUsed, it would be the last one.
 void GenBase::dumpCode(const unsigned char* code) const {
  if (code) {
    static int counter = 0;

--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -31,9 +31,10 @@ class GenBase : public Kernel {
  virtual ~GenBase() = default;
  virtual std::string name() const = 0;
  virtual size_t getSize() const = 0;
-  virtual const unsigned char* getCodeInternal() = 0;
+  virtual const unsigned char* getCodeInternal() const = 0;
+  const char* ImplType() const override { return "JitCode"; }
  template <typename Func>
-  Func getCode() {
+  Func getCode() const {
    const unsigned char* code = this->getCodeInternal();
    if (FLAGS_dump_jitcode) {
      this->dumpCode(code);
@@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator {
  virtual ~JitCodeCreator() = default;
  // condition when this jit code can be used.
-  virtual bool UseMe(const Attr& attr) const = 0;
+  virtual bool CanBeUsed(const Attr& attr) const = 0;
  // estimate this code size
  virtual size_t CodeSize(const Attr& attr) const = 0;

--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -16,6 +16,8 @@
 #include <iostream>
 #include <string>
+#include <unordered_map>
+#include <utility>  // for std::move
 #include <vector>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
@@ -27,35 +29,34 @@ namespace paddle {
 namespace operators {
 namespace jit {
-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 inline typename std::enable_if<
-    std::is_same<typename KernelTuples::data_type, float>::value &&
+    std::is_same<typename KernelTuple::data_type, float>::value &&
        std::is_same<PlaceType, platform::CPUPlace>::value,
-    typename KernelTuples::func_type>::type
+    const Kernel*>::type
-GetJitCode(const typename KernelTuples::attr_type& attr) {
+GetJitCode(const typename KernelTuple::attr_type& attr) {
-  using Func = typename KernelTuples::func_type;
+  using Attr = typename KernelTuple::attr_type;
-  using Attr = typename KernelTuples::attr_type;
+  int64_t key = JitCodeKey<Attr>(attr);
-  size_t key = JitCodeKey<Attr>(attr);
+  auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
-  auto& codes = JitCodePool<KT>().Instance();
  if (codes.Has(key)) {
-    return codes.AllKernels().at(key)->template getCode<Func>();
+    return codes.AllKernels().at(key).get();
  }
  // creator is not related with attr, so can use KernelKey as key
-  KernelKey kkey(KT, PlaceType());
+  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
  // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
-  auto& creator_map = JitCodeCreatorPool().Instance().AllCreators();
+  auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
  auto iter = creator_map.find(kkey);
  if (iter != creator_map.end()) {
    auto& creators = iter->second;
    for (auto& cur : creators) {
      auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
-      if (i && i->UseMe(attr)) {
+      if (i && i->CanBeUsed(attr)) {
        auto p = i->CreateJitCode(attr);
        if (p) {
-          auto f = p->template getCode<Func>();
+          auto res = p.get();
          codes.Insert(key, std::move(p));
-          return f;
+          return res;
        }
      }
    }
@@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) {
  return nullptr;
 }
-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 inline typename std::enable_if<
-    !std::is_same<typename KernelTuples::data_type, float>::value ||
+    !std::is_same<typename KernelTuple::data_type, float>::value ||
        !std::is_same<PlaceType, platform::CPUPlace>::value,
-    typename KernelTuples::func_type>::type
+    const Kernel*>::type
-GetJitCode(const typename KernelTuples::attr_type& attr) {
+GetJitCode(const typename KernelTuple::attr_type& attr) {
  return nullptr;
 }
 // Refer code do not related with attr, which is just for cast
 // Refer is always on CPUPlace
-template <KernelType KT, typename KernelTuples>
+template <typename KernelTuple>
-inline typename KernelTuples::func_type GetRefer() {
+inline const Kernel* GetReferKernel() {
-  auto& ref_pool = ReferKernelPool().Instance().AllKernels();
+  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
-  KernelKey kkey(KT, platform::CPUPlace());
+  KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
  auto ref_iter = ref_pool.find(kkey);
  PADDLE_ENFORCE(ref_iter != ref_pool.end(),
                 "Every Kernel should have reference function.");
  auto& ref_impls = ref_iter->second;
  for (auto& impl : ref_impls) {
-    auto i = dynamic_cast<const ReferKernel<KernelTuples>*>(impl.get());
+    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
    if (i) {
-      return i->GetFunc();
+      return i;
    }
  }
  return nullptr;
 }
-template <KernelType KT, typename KernelTuples,
+template <typename KernelTuple>
-          typename PlaceType = platform::CPUPlace>
+inline typename KernelTuple::func_type GetReferFunc() {
-typename KernelTuples::func_type Get(
+  auto ker = GetReferKernel<KernelTuple>();
-    const typename KernelTuples::attr_type& attr) {
+  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
-  auto jitfunc = GetJitCode<KT, KernelTuples, PlaceType>(attr);
+  PADDLE_ENFORCE(p, "The Refer kernel should exsit");
-  if (jitfunc) {
+  return p->GetFunc();
-    return jitfunc;
+}
+// Return all Kernels that can be used
+template <typename KernelTuple, typename PlaceType>
+std::vector<const Kernel*> GetAllCandidateKernels(
+    const typename KernelTuple::attr_type& attr) {
+  // the search order shoudl be jitcode > more > refer
+  std::vector<const Kernel*> res;
+  auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
+  if (jitker) {
+    res.emplace_back(jitker);
  }
-  // pool: (KernelKey(type, place), vector<KernelPtr>)
+  // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
-  KernelKey kkey(KT, PlaceType());
+  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
-  auto& pool = KernelPool().Instance().AllKernels();
+  auto& pool = KernelPool::Instance().AllKernels();
  auto iter = pool.find(kkey);
  if (iter != pool.end()) {
    auto& impls = iter->second;
    for (auto& impl : impls) {
-      auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get());
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
-      if (i && i->UseMe(attr)) {
+      if (i && i->CanBeUsed(attr)) {
-        return i->GetFunc();
+        res.emplace_back(i);
      }
    }
  }
  // The last implementation should be reference function on CPUPlace.
-  return GetRefer<KT, KernelTuples>();
+  auto ref = GetReferKernel<KernelTuple>();
+  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
+  res.emplace_back(ref);
+  return res;
+}
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+std::vector<std::pair<std::string, typename KernelTuple::func_type>>
+GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
+  using Func = typename KernelTuple::func_type;
+  auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
+  std::vector<std::pair<std::string, Func>> res;
+  for (auto k : kers) {
+    std::string name = k->ImplType();
+    if (name == "JitCode") {
+      auto i = dynamic_cast<const GenBase*>(k);
+      PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
+      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
+    } else {
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
+      PADDLE_ENFORCE(i, "kernel cast can not fail.");
+      res.emplace_back(std::make_pair(name, i->GetFunc()));
+    }
+  }
+  return res;
+}
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
+  std::vector<typename KernelTuple::func_type> res;
+  for (auto& i : funcs) {
+    res.emplace_back(i.second);
+  }
+  return res;
+}
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+typename KernelTuple::func_type GetDefaultBestFunc(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
+  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
+  // Here could do some runtime benchmark of this attr and return the best one.
+  // But yet just get the first one as the default best one,
+  // which is searched in order and tuned by offline.
+  return funcs[0];
 }
-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 class KernelFuncs {
 public:
  KernelFuncs() = default;
  static KernelFuncs& Cache() {
-    static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache;
+    static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
    return g_func_cache;
  }
-  bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }
+  // the exposed interface to use
+  typename KernelTuple::func_type At(
-  void Insert(int key, typename KernelTuples::func_type func) {
+      const typename KernelTuple::attr_type& attr) {
-    funcs_.emplace(key, func);
+    // Maybe here is not good enough, not all kernels should have jitcode
-  }
+    int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
-  typename KernelTuples::func_type At(int key) {
    if (Has(key)) {
      return funcs_.at(key);
    }
-    auto func = Get<KT, KernelTuples, PlaceType>(key);
+    // If do not have this attr in cache then get the default best
+    auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
    Insert(key, func);
    return func;
  }
+  typename KernelTuple::func_type operator[](
+      const typename KernelTuple::attr_type& attr) {
+    return At(attr);
+  }
+ protected:
+  bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
+  void Insert(int64_t key, typename KernelTuple::func_type func) {
+    funcs_.emplace(key, func);
+  }
 private:
-  std::unordered_map<int, typename KernelTuples::func_type> funcs_;
+  std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
  DISABLE_COPY_AND_ASSIGN(KernelFuncs);
 };

--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -62,26 +62,55 @@ typedef enum {
  kSqrt,
 } SeqPoolType;
+// x, y, z, n
 template <typename T>
-struct XYZNTuples {
+struct XYZNTuple {
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, const T*, T*, int);
 };
+// a, x, y, n
 template <typename T>
-struct AXYNTuples : public XYZNTuples<T> {};
+struct AXYNTuple : public XYZNTuple<T> {};
+// x, y, n
 template <typename T>
-struct XYNTuples {
+struct XYNTuple {
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, T*, int);
 };
-// x, return and int
+// x, returned value, n
 template <typename T>
-struct XRNTuples : public XYNTuples<T> {};
+struct XRNTuple : public XYNTuple<T> {};
+#define DECLARE_KERNELTUPLE(kernel_tuple, type)        \
+  template <typename T>                                \
+  struct type##Tuple : public kernel_tuple<T> {        \
+    static constexpr KernelType kernel_type = k##type; \
+  }
+// Tuple should be corresponding to the KernelType
+DECLARE_KERNELTUPLE(XYZNTuple, VMul);
+DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
+DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
+DECLARE_KERNELTUPLE(XYZNTuple, VSub);
+DECLARE_KERNELTUPLE(AXYNTuple, VScal);
+DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
+DECLARE_KERNELTUPLE(XYNTuple, VRelu);
+DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
+DECLARE_KERNELTUPLE(XYNTuple, VSquare);
+DECLARE_KERNELTUPLE(XYNTuple, VExp);
+DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
+DECLARE_KERNELTUPLE(XYNTuple, VTanh);
+DECLARE_KERNELTUPLE(XYNTuple, VCopy);
+DECLARE_KERNELTUPLE(XRNTuple, HMax);
+DECLARE_KERNELTUPLE(XRNTuple, HSum);
 typedef struct {
  void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
@@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t;
 typedef struct lstm_attr_s lstm_attr_t;
 template <typename T>
-struct LSTMTuples {
+struct LSTMTuple {
  typedef T data_type;
  typedef lstm_attr_t attr_type;
  typedef void (*func_type)(lstm_t*, const lstm_attr_t*);
 };
 template <typename T>
-struct GRUTuples {
+struct GRUTuple {
  typedef T data_type;
  typedef gru_attr_t attr_type;
  typedef void (*func_type)(gru_t*, const gru_attr_t*);
 };
+DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt);
+DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1);
+DECLARE_KERNELTUPLE(GRUTuple, GRUH1);
+DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1);
+DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2);
+#undef DECLARE_KERNELTUPLE
 template <typename T>
-struct VBroadcastTuples {
+struct VBroadcastTuple {
+  static constexpr KernelType kernel_type = kVBroadcast;
  typedef T data_type;
  typedef int64_t attr_type;
  typedef void (*func_type)(const T*, T*, int64_t, int64_t);
@@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s {
 } seq_pool_attr_t;
 template <typename T>
-struct SeqPoolTuples {
+struct SeqPoolTuple {
+  static constexpr KernelType kernel_type = kSeqPool;
  typedef T data_type;
  typedef seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
@@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s {
 } emb_seq_pool_attr_t;
 template <typename T>
-struct EmbSeqPoolTuples {
+struct EmbSeqPoolTuple {
+  static constexpr KernelType kernel_type = kEmbSeqPool;
  typedef T data_type;
  typedef emb_seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, const int64_t*, T*,
@@ -198,7 +239,8 @@ typedef struct sgd_attr_s {
 } sgd_attr_t;
 template <typename T>
-struct SgdTuples {
+struct SgdTuple {
+  static constexpr KernelType kernel_type = kSgd;
  typedef T data_type;
  typedef sgd_attr_t attr_type;
  typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
@@ -214,21 +256,24 @@ typedef struct matmul_attr_s {
 } matmul_attr_t;
 template <typename T>
-struct MatMulTuples {
+struct MatMulTuple {
+  static constexpr KernelType kernel_type = kMatMul;
  typedef T data_type;
  typedef matmul_attr_t attr_type;
  typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
 };
 template <typename T>
-struct CRFDecodingTuples {
+struct CRFDecodingTuple {
+  static constexpr KernelType kernel_type = kCRFDecoding;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const int, const T*, const T*, T*, int*, int);
 };
 template <typename T>
-struct LayerNormTuples {
+struct LayerNormTuple {
+  static constexpr KernelType kernel_type = kLayerNorm;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int,
@@ -236,7 +281,8 @@ struct LayerNormTuples {
 };
 template <typename T>
-struct SoftmaxTuples {
+struct SoftmaxTuple {
+  static constexpr KernelType kernel_type = kSoftmax;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, T*, int, int);
@@ -244,7 +290,8 @@ struct SoftmaxTuples {
 // nChw16c = nChw16c .* NC
 template <typename T>
-struct NCHW16CMulNCTuples {
+struct NCHW16CMulNCTuple {
+  static constexpr KernelType kernel_type = kNCHW16CMulNC;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, const T*, T*, int, int);
@@ -255,28 +302,29 @@ class Kernel {
 public:
  Kernel() = default;
  virtual ~Kernel() = default;
+  virtual const char* ImplType() const = 0;
  DISABLE_COPY_AND_ASSIGN(Kernel);
 };
-template <typename KernelTuples>
+template <typename KernelTuple>
 class KernelMore : public Kernel {
 public:
-  using T = typename KernelTuples::data_type;
+  using T = typename KernelTuple::data_type;
-  using Func = typename KernelTuples::func_type;
+  using Func = typename KernelTuple::func_type;
-  using Attr = typename KernelTuples::attr_type;
+  using Attr = typename KernelTuple::attr_type;
  virtual Func GetFunc() const { return func; }
-  virtual bool UseMe(const Attr& attr) const = 0;
+  // specify this kernel can be used, means it should not fail if use it.
-  virtual const char* ImplType() const = 0;
+  virtual bool CanBeUsed(const Attr& attr) const = 0;
 protected:
  Func func{nullptr};
 };
-template <typename KernelTuples>
+template <typename KernelTuple>
-class ReferKernel : public KernelMore<KernelTuples> {
+class ReferKernel : public KernelMore<KernelTuple> {
 public:
  // Refer code can always be used
-  bool UseMe(const typename KernelTuples::attr_type& attr) const override {
+  bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override {
    return true;
  }
  const char* ImplType() const override { return "Refer"; }

--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -13,6 +13,7 @@
 * limitations under the License. */
 #include "paddle/fluid/operators/jit/kernel_key.h"
+#include <xxhash.h>  // XXH64: 13.8 GB/s
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -20,71 +21,46 @@ namespace operators {
 namespace jit {
 template <>
-size_t JitCodeKey<int>(const int& d) {
+int64_t JitCodeKey<int>(const int& d) {
  return d;
 }
 template <>
-size_t JitCodeKey<int64_t>(const int64_t& d) {
+int64_t JitCodeKey<int64_t>(const int64_t& d) {
  return d;
 }
-// TODO(TJ): refine and benchmark JitCodeKey generatation
-constexpr int act_type_shift = 3;  // suppot 2^3 act types
-static inline int act_type_convert(KernelType type) {
-  if (type == kVIdentity) {
-    return 0;
-  } else if (type == kVExp) {
-    return 1;
-  } else if (type == kVRelu) {
-    return 2;
-  } else if (type == kVSigmoid) {
-    return 3;
-  } else if (type == kVTanh) {
-    return 4;
-  }
-  PADDLE_THROW("Unsupported act type %d", type);
-  return 0;
-}
 template <>
-size_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
+int64_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
-  size_t key = attr.d;
+  return XXH64(&attr, sizeof(gru_attr_t), 0);
-  int gate_key = act_type_convert(attr.act_gate) << 1;
-  int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift);
-  int cell_key = act_type_convert(attr.act_cell) << (1 + act_type_shift * 2);
-  return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key +
-         attr.use_peephole;
 }
 template <>
-size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
+int64_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
-  size_t key = attr.d;
+  int keys[5] = {
-  return (key << (act_type_shift * 2)) + act_type_convert(attr.act_gate) +
+      attr.d, static_cast<int>(attr.act_gate), static_cast<int>(attr.act_cand),
-         (act_type_convert(attr.act_cand) << act_type_shift);
+      static_cast<int>(attr.act_cell), static_cast<int>(attr.use_peephole)};
+  return XXH64(keys, sizeof(int) * 5, 0);
 }
 template <>
-size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
+int64_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
-  size_t key = attr.w;
+  int keys[2] = {attr.w, static_cast<int>(attr.type)};
-  constexpr int pool_type_shift = 3;
+  return XXH64(keys, sizeof(int) * 2, 0);
-  return (key << pool_type_shift) + static_cast<int>(attr.type);
 }
 template <>
-size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
+int64_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
-  size_t key = attr.m;
+  return XXH64(&attr, sizeof(int) * 3, 0);  // m, n, k
-  constexpr int shift = 21;
-  return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
 }
 template <>
-size_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
+int64_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
  return attr.table_width;
 }
 template <>
-size_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
+int64_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
  return attr.grad_width;
 }

--- a/paddle/fluid/operators/jit/kernel_key.h
+++ b/paddle/fluid/operators/jit/kernel_key.h
@@ -46,7 +46,7 @@ struct KernelKey {
 // Every JitCode should have a method to get the key from attribution
 template <typename Attr>
-size_t JitCodeKey(const Attr& attr);
+int64_t JitCodeKey(const Attr& attr);
 }  // namespace jit
 }  // namespace operators

--- a/paddle/fluid/operators/jit/kernel_pool.h
+++ b/paddle/fluid/operators/jit/kernel_pool.h
@@ -17,6 +17,7 @@
 #include <memory>  // for unique_ptr
 #include <string>
 #include <unordered_map>
+#include <utility>  // for move
 #include <vector>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
@@ -30,7 +31,7 @@ namespace jit {
 template <KernelType KT>
 class JitCodePool {
  typedef std::unique_ptr<GenBase> GenBasePtr;
-  typedef std::unordered_map<size_t, GenBasePtr> JitCodeMap;
+  typedef std::unordered_map<int64_t, GenBasePtr> JitCodeMap;
 public:
  JitCodePool() = default;
@@ -41,9 +42,9 @@ class JitCodePool {
  const JitCodeMap& AllKernels() { return codes_; }
-  bool Has(size_t key) const { return codes_.find(key) != codes_.end(); }
+  bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); }
-  void Insert(size_t key, GenBasePtr value) {
+  void Insert(int64_t key, GenBasePtr value) {
    codes_.emplace(key, std::move(value));
  }

--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
@@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w,
  }
 }
-bool CRFDecodingKernel::UseMe(const int& d) const {
+bool CRFDecodingKernel::CanBeUsed(const int& d) const {
 #ifdef __AVX512F__
  constexpr int block = ZMM_FLOAT_BLOCK;
 #else

--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
@@ -26,11 +26,11 @@ namespace intrinsic {
 void CRFDecoding(const int seq_len, const float* x, const float* w,
                 float* alpha, int* track, int tag_num);
-class CRFDecodingKernel : public KernelMore<CRFDecodingTuples<float>> {
+class CRFDecodingKernel : public KernelMore<CRFDecodingTuple<float>> {
 public:
  CRFDecodingKernel() { this->func = CRFDecoding; }
-  bool UseMe(
+  bool CanBeUsed(
-      const typename CRFDecodingTuples<float>::attr_type&) const override;
+      const typename CRFDecodingTuple<float>::attr_type&) const override;
  const char* ImplType() const override { return "Intrinsic"; }
 };

--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
@@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
  }
 }
-bool LayerNormKernel::UseMe(const int& d) const {
+bool LayerNormKernel::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
 }

--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
@@ -27,10 +27,11 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
               const float* scale, const float* bias, int height,
               const float epsilon, int right);
-class LayerNormKernel : public KernelMore<LayerNormTuples<float>> {
+class LayerNormKernel : public KernelMore<LayerNormTuple<float>> {
 public:
  LayerNormKernel() { this->func = LayerNorm; }
-  bool UseMe(const typename LayerNormTuples<float>::attr_type&) const override;
+  bool CanBeUsed(
+      const typename LayerNormTuple<float>::attr_type&) const override;
  const char* ImplType() const override { return "Intrinsic"; }
 };

--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -23,6 +23,8 @@ namespace jit {
 namespace more {
 namespace mix {
+using CPUPlace = platform::CPUPlace;
 void VSigmoid(const T* x, T* y, int n) {
  const float min = SIGMOID_THRESHOLD_MIN;
  const float max = SIGMOID_THRESHOLD_MAX;
@@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) {
    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
    y[i] = static_cast<T>(0) - y[i];
  }
-  auto compute = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+  auto compute = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
  compute(y, y, n);
  for (int i = 0; i < n; ++i) {
    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
@@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) {
 void VTanh(const T* x, T* y, int n) {
  const T a = 2, b = -1;
-  auto compute_scal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_scal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_addbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_addbias = KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_sigmoid = Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_sigmoid = KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(n);
  compute_scal(&a, x, y, n);
  compute_sigmoid(y, y, n);
  compute_scal(&a, y, y, n);
@@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) {
 }
 void Softmax(const T* x, T* y, int n, int bs) {
-  auto compute_hmax =
+  auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
-      KernelFuncs<kHMax, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_hsum =
+  auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
-      KernelFuncs<kHSum, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
-  auto compute_vscal =
-      KernelFuncs<kVScal, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
  auto compute_vaddbias =
-      KernelFuncs<kVAddBias, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
+      KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
-  auto compute_vexp =
+  auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
-      KernelFuncs<kVExp, XYNTuples<T>, platform::CPUPlace>::Cache().At(n);
  for (int i = 0; i < bs; ++i) {
    T scalar;
@@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) {
 void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
  if (type == kVSigmoid) {
-    return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVRelu) {
-    return Get<kVRelu, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VReluTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVTanh) {
-    return Get<kVTanh, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VTanhTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVIdentity) {
-    return Get<kVIdentity, XYNTuples<T>, platform::CPUPlace>(d);
+    return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
  }
  PADDLE_THROW("Not support type: %s", type);
  return nullptr;
@@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
  const int d = attr->d;
  const int d2 = d * 2;
  const int d3 = d * 3;
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
-  auto vadd_d = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
-  auto vadd_d2 = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d2);
+  auto vadd_d2 = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d2);
  auto act_gate_d = getActFunc(attr->act_gate, d);
  auto act_gate_d2 = getActFunc(attr->act_gate, d2);
  auto act_gate_d3 = getActFunc(attr->act_gate, d3);
@@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
  int d = attr->d;
  int d2 = d * 2;
  int d3 = d * 3;
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
-  auto vadd_d = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
  auto act_gate_d = getActFunc(attr->act_gate, d);
  auto act_cand_d = getActFunc(attr->act_cand, d);
  auto act_cell_d = getActFunc(attr->act_cell, d);
@@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) {
  int d2 = d * 2;
  auto act_gate = getActFunc(attr->act_gate, d);
  auto act_cand = getActFunc(attr->act_cand, d);
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
  act_gate(gates, gates, d);
  act_cand(gates + d2, gates + d2, d);
  vmul_d(gates, gates + d2, ht, d);
@@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
  T* ht = reinterpret_cast<T*>(step->ht);
  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
  auto act_gate = getActFunc(attr->act_gate, attr->d);
-  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(attr->d);
+  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(attr->d);
  act_gate(gates + attr->d, gates + attr->d, attr->d);
  vmul_d(ht_1, gates + attr->d, ht, attr->d);
 }
@@ -206,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
 }
 // TODO(TJ): tuning me
-bool VSigmoidKernel::UseMe(const int& d) const { return true; }
+bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; }
-bool VTanhKernel::UseMe(const int& d) const { return true; }
+bool VTanhKernel::CanBeUsed(const int& d) const { return true; }
-bool SoftmaxKernel::UseMe(const int& d) const { return true; }
+bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; }
-bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; }
+bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }
-bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; }
+bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }
-bool GRUH1Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
-bool GRUHtPart1Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
-bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
 }  // namespace mix
 }  // namespace more
@@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
 namespace mix = paddle::operators::jit::more::mix;
-#define REGISTER_MORE_KERNEL(key, func) \
+#define REGISTER_MORE_KERNEL(func) \
-  REGISTER_JITKERNEL_MORE(key, mix, mix::func##Kernel)
+  REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel)
-REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid);
+REGISTER_MORE_KERNEL(VSigmoid);
-REGISTER_MORE_KERNEL(kVTanh, VTanh);
+REGISTER_MORE_KERNEL(VTanh);
-REGISTER_MORE_KERNEL(kSoftmax, Softmax);
+REGISTER_MORE_KERNEL(Softmax);
-REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt);
+REGISTER_MORE_KERNEL(LSTMCtHt);
-REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1);
+REGISTER_MORE_KERNEL(LSTMC1H1);
-REGISTER_MORE_KERNEL(kGRUH1, GRUH1);
+REGISTER_MORE_KERNEL(GRUH1);
-REGISTER_MORE_KERNEL(kGRUHtPart1, GRUHtPart1);
+REGISTER_MORE_KERNEL(GRUHtPart1);
-REGISTER_MORE_KERNEL(kGRUHtPart2, GRUHtPart2);
+REGISTER_MORE_KERNEL(GRUHtPart2);
 #undef REGISTER_MORE_KERNEL
--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr);
 void GRUHtPart1(gru_t* step, const gru_attr_t* attr);
 void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
-#define DECLARE_MORE_KERNEL(name, tuples)                            \
+#define DECLARE_MORE_KERNEL(name)                                             \
-  class name##Kernel : public KernelMore<tuples<T>> {                \
+  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
   public:                                                                    \
    name##Kernel() { this->func = name; }                                     \
-    bool UseMe(const typename tuples<T>::attr_type&) const override; \
+    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
    const char* ImplType() const override { return "Mixed"; }                 \
  }
 // XYN
-DECLARE_MORE_KERNEL(VSigmoid, XYNTuples);
+DECLARE_MORE_KERNEL(VSigmoid);
-DECLARE_MORE_KERNEL(VTanh, XYNTuples);
+DECLARE_MORE_KERNEL(VTanh);
 // XRN
-DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples);
+DECLARE_MORE_KERNEL(Softmax);
-DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples);
+DECLARE_MORE_KERNEL(LSTMCtHt);
-DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples);
+DECLARE_MORE_KERNEL(LSTMC1H1);
-DECLARE_MORE_KERNEL(GRUH1, GRUTuples);
+DECLARE_MORE_KERNEL(GRUH1);
-DECLARE_MORE_KERNEL(GRUHtPart1, GRUTuples);
+DECLARE_MORE_KERNEL(GRUHtPart1);
-DECLARE_MORE_KERNEL(GRUHtPart2, GRUTuples);
+DECLARE_MORE_KERNEL(GRUHtPart2);
 #undef DECLARE_MORE_KERNEL

--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -130,104 +130,105 @@ void ASum<double>(const double* x, double* res, int n) {
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
-bool VMulKernel<float>::UseMe(const int& d) const {
+bool VMulKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx512f) && d > 512;
 }
 template <>
-bool VAddKernel<float>::UseMe(const int& d) const {
+bool VAddKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx) && d > 512;
 }
 template <>
-bool VScalKernel<float>::UseMe(const int& d) const {
+bool VScalKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx512f) && d > 512;
 }
 template <>
-bool VExpKernel<float>::UseMe(const int& d) const {
+bool VExpKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
 }
 template <>
-bool VSquareKernel<float>::UseMe(const int& d) const {
+bool VSquareKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
 }
 template <>
-bool VCopyKernel<float>::UseMe(const int& d) const {
+bool VCopyKernel<float>::CanBeUsed(const int& d) const {
  return d > 15;
 }
 template <>
-bool VBroadcastKernel<float>::UseMe(const int64_t& d) const {
+bool VBroadcastKernel<float>::CanBeUsed(const int64_t& d) const {
  return d > 127;
 }
 template <>
-bool VBroadcastKernel<double>::UseMe(const int64_t& attr) const {
+bool VBroadcastKernel<double>::CanBeUsed(const int64_t& attr) const {
  return true;
 }
 template <>
-bool VSigmoidKernel<float>::UseMe(const int& d) const {
+bool VSigmoidKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
 }
 template <>
-bool VTanhKernel<float>::UseMe(const int& d) const {
+bool VTanhKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
 }
 template <>
-bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
+bool SeqPoolKernel<float>::CanBeUsed(const seq_pool_attr_t& attr) const {
  return true;
 }
 template <>
-bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
+bool SeqPoolKernel<double>::CanBeUsed(const seq_pool_attr_t& attr) const {
  return true;
 }
 template <>
-bool EmbSeqPoolKernel<float>::UseMe(const emb_seq_pool_attr_t& attr) const {
+bool EmbSeqPoolKernel<float>::CanBeUsed(const emb_seq_pool_attr_t& attr) const {
  return true;
 }
 template <>
-bool EmbSeqPoolKernel<double>::UseMe(const emb_seq_pool_attr_t& attr) const {
+bool EmbSeqPoolKernel<double>::CanBeUsed(
+    const emb_seq_pool_attr_t& attr) const {
  return true;
 }
 template <>
-bool SgdKernel<float>::UseMe(const sgd_attr_t& attr) const {
+bool SgdKernel<float>::CanBeUsed(const sgd_attr_t& attr) const {
  return true;
 }
 template <>
-bool SgdKernel<double>::UseMe(const sgd_attr_t& attr) const {
+bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
  return true;
 }
 template <>
-bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
+bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
  return platform::MayIUse(platform::avx);
 }
 template <>
-bool MatMulKernel<double>::UseMe(const matmul_attr_t& attr) const {
+bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
  return true;
 }
 template <>
-bool SoftmaxKernel<float>::UseMe(const int& d) const {
+bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
  // tuned on avx2
  return platform::MayIUse(platform::avx) && d < 60;
 }
 #define AWALYS_USE_ME_WITH_DOUBLE(func)                      \
  template <>                                                \
-  bool func##Kernel<double>::UseMe(const int& d) const { \
+  bool func##Kernel<double>::CanBeUsed(const int& d) const { \
    return true;                                             \
  }
@@ -250,23 +251,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax);
 namespace mkl = paddle::operators::jit::more::mkl;
-#define REGISTER_MKL_KERNEL(key, func)                        \
+#define REGISTER_MKL_KERNEL(func)                                 \
-  REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel<float>, \
+  REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel<float>, \
                          mkl::func##Kernel<double>)
-REGISTER_MKL_KERNEL(kMatMul, MatMul);
+REGISTER_MKL_KERNEL(MatMul);
-REGISTER_MKL_KERNEL(kVMul, VMul);
+REGISTER_MKL_KERNEL(VMul);
-REGISTER_MKL_KERNEL(kVAdd, VAdd);
+REGISTER_MKL_KERNEL(VAdd);
-REGISTER_MKL_KERNEL(kVScal, VScal);
+REGISTER_MKL_KERNEL(VScal);
-REGISTER_MKL_KERNEL(kVExp, VExp);
+REGISTER_MKL_KERNEL(VExp);
-REGISTER_MKL_KERNEL(kVSquare, VSquare);
+REGISTER_MKL_KERNEL(VSquare);
-REGISTER_MKL_KERNEL(kVCopy, VCopy);
+REGISTER_MKL_KERNEL(VCopy);
-REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast);
+REGISTER_MKL_KERNEL(VBroadcast);
-REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
+REGISTER_MKL_KERNEL(VSigmoid);
-REGISTER_MKL_KERNEL(kVTanh, VTanh);
+REGISTER_MKL_KERNEL(VTanh);
-REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
+REGISTER_MKL_KERNEL(SeqPool);
-REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool);
+REGISTER_MKL_KERNEL(EmbSeqPool);
-REGISTER_MKL_KERNEL(kSoftmax, Softmax);
+REGISTER_MKL_KERNEL(Softmax);
-REGISTER_MKL_KERNEL(kSgd, Sgd);
+REGISTER_MKL_KERNEL(Sgd);
 #undef REGISTER_MKL_KERNEL
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
  }
 }
-#define DECLARE_MKL_KERNEL(name, tuples)                             \
+#define DECLARE_MKL_KERNEL(name)                                              \
  template <typename T>                                                       \
-  class name##Kernel : public KernelMore<tuples<T>> {                \
+  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
   public:                                                                    \
    name##Kernel() { this->func = name<T>; }                                  \
-    bool UseMe(const typename tuples<T>::attr_type&) const override; \
+    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
    const char* ImplType() const override { return "MKL"; }                   \
  }
 // ABCMNK
-DECLARE_MKL_KERNEL(MatMul, MatMulTuples);
+DECLARE_MKL_KERNEL(MatMul);
 // XYZN
-DECLARE_MKL_KERNEL(VMul, XYZNTuples);
+DECLARE_MKL_KERNEL(VMul);
-DECLARE_MKL_KERNEL(VAdd, XYZNTuples);
+DECLARE_MKL_KERNEL(VAdd);
 // AXYN
-DECLARE_MKL_KERNEL(VScal, AXYNTuples);
+DECLARE_MKL_KERNEL(VScal);
 // XYN
-DECLARE_MKL_KERNEL(VExp, XYNTuples);
+DECLARE_MKL_KERNEL(VExp);
-DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
+DECLARE_MKL_KERNEL(VSigmoid);
-DECLARE_MKL_KERNEL(VTanh, XYNTuples);
+DECLARE_MKL_KERNEL(VTanh);
-DECLARE_MKL_KERNEL(VSquare, XYNTuples);
+DECLARE_MKL_KERNEL(VSquare);
-DECLARE_MKL_KERNEL(VCopy, XYNTuples);
+DECLARE_MKL_KERNEL(VCopy);
-DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
+// others
+DECLARE_MKL_KERNEL(SeqPool);
-DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
+DECLARE_MKL_KERNEL(EmbSeqPool);
+DECLARE_MKL_KERNEL(Softmax);
-DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
+DECLARE_MKL_KERNEL(Sgd);
+DECLARE_MKL_KERNEL(VBroadcast);
-DECLARE_MKL_KERNEL(Sgd, SgdTuples);
-DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples);
 #undef DECLARE_MKL_KERNEL

--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ b/paddle/fluid/operators/jit/refer/refer.cc
@@ -17,51 +17,43 @@
 namespace refer = paddle::operators::jit::refer;
-#define REGISTER_REFER_KERNEL(key, func)                    \
+#define REGISTER_REFER_KERNEL(func)                             \
-  REGISTER_JITKERNEL_REFER(key, refer::func##Kernel<float>, \
+  REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel<float>, \
                           refer::func##Kernel<double>)
-REGISTER_REFER_KERNEL(kVMul, VMul);
+REGISTER_REFER_KERNEL(VMul);
-REGISTER_REFER_KERNEL(kVAdd, VAdd);
+REGISTER_REFER_KERNEL(VAdd);
-REGISTER_REFER_KERNEL(kVAddRelu, VAddRelu);
+REGISTER_REFER_KERNEL(VAddRelu);
-REGISTER_REFER_KERNEL(kVSub, VSub);
+REGISTER_REFER_KERNEL(VSub);
-REGISTER_REFER_KERNEL(kVScal, VScal);
+REGISTER_REFER_KERNEL(VScal);
-REGISTER_REFER_KERNEL(kVAddBias, VAddBias);
+REGISTER_REFER_KERNEL(VAddBias);
-REGISTER_REFER_KERNEL(kVRelu, VRelu);
+REGISTER_REFER_KERNEL(VRelu);
-REGISTER_REFER_KERNEL(kVCopy, VCopy);
+REGISTER_REFER_KERNEL(VCopy);
-REGISTER_REFER_KERNEL(kVIdentity, VIdentity);
+REGISTER_REFER_KERNEL(VIdentity);
-REGISTER_REFER_KERNEL(kVSquare, VSquare);
+REGISTER_REFER_KERNEL(VSquare);
-REGISTER_REFER_KERNEL(kVExp, VExp);
+REGISTER_REFER_KERNEL(VExp);
-REGISTER_REFER_KERNEL(kVSigmoid, VSigmoid);
+REGISTER_REFER_KERNEL(VSigmoid);
-REGISTER_REFER_KERNEL(kVTanh, VTanh);
+REGISTER_REFER_KERNEL(VTanh);
-REGISTER_REFER_KERNEL(kLSTMCtHt, LSTMCtHt);
+REGISTER_REFER_KERNEL(LSTMCtHt);
-REGISTER_REFER_KERNEL(kLSTMC1H1, LSTMC1H1);
+REGISTER_REFER_KERNEL(LSTMC1H1);
-REGISTER_REFER_KERNEL(kGRUH1, GRUH1);
+REGISTER_REFER_KERNEL(GRUH1);
-REGISTER_REFER_KERNEL(kGRUHtPart1, GRUHtPart1);
+REGISTER_REFER_KERNEL(GRUHtPart1);
-REGISTER_REFER_KERNEL(kGRUHtPart2, GRUHtPart2);
+REGISTER_REFER_KERNEL(GRUHtPart2);
-REGISTER_REFER_KERNEL(kCRFDecoding, CRFDecoding);
+REGISTER_REFER_KERNEL(CRFDecoding);
-REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm);
+REGISTER_REFER_KERNEL(LayerNorm);
+REGISTER_REFER_KERNEL(NCHW16CMulNC);
-REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC);
+REGISTER_REFER_KERNEL(SeqPool);
+REGISTER_REFER_KERNEL(MatMul);
-REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
+REGISTER_REFER_KERNEL(HMax);
+REGISTER_REFER_KERNEL(HSum);
-REGISTER_REFER_KERNEL(kMatMul, MatMul);
+REGISTER_REFER_KERNEL(Softmax);
+REGISTER_REFER_KERNEL(EmbSeqPool);
-REGISTER_REFER_KERNEL(kHMax, HMax);
+REGISTER_REFER_KERNEL(Sgd);
-REGISTER_REFER_KERNEL(kHSum, HSum);
+REGISTER_REFER_KERNEL(VBroadcast);
-REGISTER_REFER_KERNEL(kSoftmax, Softmax);
-REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
-REGISTER_REFER_KERNEL(kSgd, Sgd);
-REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast);
 #undef REGISTER_REFER_KERNEL
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
  }
 }
-#define DECLARE_REFER_KERNEL(name, tuples)             \
+#define DECLARE_REFER_KERNEL(name)                          \
  template <typename T>                                     \
-  class name##Kernel : public ReferKernel<tuples<T>> { \
+  class name##Kernel : public ReferKernel<name##Tuple<T>> { \
   public:                                                  \
    name##Kernel() { this->func = name<T>; }                \
  }
 // const T* x, const T* y, T* z, int n
-DECLARE_REFER_KERNEL(VMul, XYZNTuples);
+DECLARE_REFER_KERNEL(VMul);
-DECLARE_REFER_KERNEL(VAdd, XYZNTuples);
+DECLARE_REFER_KERNEL(VAdd);
-DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples);
+DECLARE_REFER_KERNEL(VAddRelu);
-DECLARE_REFER_KERNEL(VSub, XYZNTuples);
+DECLARE_REFER_KERNEL(VSub);
 // const T* a, const T* x, T* y, int n
-DECLARE_REFER_KERNEL(VScal, AXYNTuples);
+DECLARE_REFER_KERNEL(VScal);
-DECLARE_REFER_KERNEL(VAddBias, AXYNTuples);
+DECLARE_REFER_KERNEL(VAddBias);
 // const T* x, T* y, int n
-DECLARE_REFER_KERNEL(VRelu, XYNTuples);
+DECLARE_REFER_KERNEL(VRelu);
-DECLARE_REFER_KERNEL(VIdentity, XYNTuples);
+DECLARE_REFER_KERNEL(VIdentity);
-DECLARE_REFER_KERNEL(VExp, XYNTuples);
+DECLARE_REFER_KERNEL(VExp);
-DECLARE_REFER_KERNEL(VSigmoid, XYNTuples);
+DECLARE_REFER_KERNEL(VSigmoid);
-DECLARE_REFER_KERNEL(VTanh, XYNTuples);
+DECLARE_REFER_KERNEL(VTanh);
-DECLARE_REFER_KERNEL(VSquare, XYNTuples);
+DECLARE_REFER_KERNEL(VSquare);
-DECLARE_REFER_KERNEL(VCopy, XYNTuples);
+DECLARE_REFER_KERNEL(VCopy);
 // lstm_t*, const lstm_attr_t*
-DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples);
+DECLARE_REFER_KERNEL(LSTMCtHt);
-DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples);
+DECLARE_REFER_KERNEL(LSTMC1H1);
 // gru_t*, const gru_attr_t*
-DECLARE_REFER_KERNEL(GRUH1, GRUTuples);
+DECLARE_REFER_KERNEL(GRUH1);
-DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples);
+DECLARE_REFER_KERNEL(GRUHtPart1);
-DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples);
+DECLARE_REFER_KERNEL(GRUHtPart2);
-DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples);
+DECLARE_REFER_KERNEL(HMax);
-DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples);
+DECLARE_REFER_KERNEL(HSum);
-DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples);
+// others
+DECLARE_REFER_KERNEL(CRFDecoding);
-DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
+DECLARE_REFER_KERNEL(LayerNorm);
+DECLARE_REFER_KERNEL(NCHW16CMulNC);
-DECLARE_REFER_KERNEL(MatMul, MatMulTuples);
+DECLARE_REFER_KERNEL(SeqPool);
+DECLARE_REFER_KERNEL(MatMul);
-DECLARE_REFER_KERNEL(HMax, XRNTuples);
+DECLARE_REFER_KERNEL(Softmax);
-DECLARE_REFER_KERNEL(HSum, XRNTuples);
+DECLARE_REFER_KERNEL(EmbSeqPool);
+DECLARE_REFER_KERNEL(Sgd);
-DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
+DECLARE_REFER_KERNEL(VBroadcast);
-DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
-DECLARE_REFER_KERNEL(Sgd, SgdTuples);
-DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples);
 #undef DECLARE_REFER_KERNEL

--- a/paddle/fluid/operators/jit/registry.h
+++ b/paddle/fluid/operators/jit/registry.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <tuple>
 #include <type_traits>
+#include <utility>  // for std::move
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/operators/jit/kernel_pool.h"
 #include "paddle/fluid/platform/place.h"
@@ -49,7 +50,7 @@ struct JitKernelRegistrarFunctor<Pool, PlaceType, false, I, KernelImpls...> {
  void operator()(KernelType kt) const {
    KernelKey kkey(kt, PlaceType());
-    Pool().Instance().Insert(kkey,
+    Pool::Instance().Insert(kkey,
                            std::move(make_unique<const KERNEL_IMPL_TYPE>()));
    constexpr auto size = std::tuple_size<std::tuple<KernelImpls...>>::value;
    JitKernelRegistrarFunctor<Pool, PlaceType, I + 1 == size, I + 1,

--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
+#include <iostream>
 #include <random>
 #include <string>
 #include <vector>
@@ -64,17 +65,47 @@ std::vector<int> TestSizes() {
 namespace jit = paddle::operators::jit;
 using CPUPlace = paddle::platform::CPUPlace;
-template <typename KernelTuples, typename... Args>
+template <typename KernelTuple, typename PlaceType, typename Tester,
-struct TestFuncWithRefer {
+          typename... Args>
-  void operator()(const typename KernelTuples::func_type tgt, Args... args) {
+void TestAllImpls(const typename KernelTuple::attr_type& attr,
-    LOG(FATAL) << "Should specify this function.";
+                  const Tester& verifier, const Args&... args) {
+  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
+  for (auto f : funcs) {
+    VLOG(10) << "Test Kernel " << f.first;
+    verifier(f.second, args...);
  }
-};
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::XYZNTuples<T>, std::vector<T>, std::vector<T>,
+void TestKernelXYZN() {
-                         std::vector<T>> {
+  using T = typename KernelTuple::data_type;
-  void operator()(const typename jit::XYZNTuples<T>::func_type tgt,
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  for (int d : TestSizes()) {
+    auto ref = jit::GetReferFunc<KernelTuple>();
+    EXPECT_TRUE(ref != nullptr);
+    std::vector<T> x(d), y(d), zref(d);
+    RandomVec<T>(d, x.data());
+    RandomVec<T>(d, y.data());
+    std::vector<T> xinp(d), yinp(d);  // inplace test
+    std::copy(x.begin(), x.end(), xinp.begin());
+    std::copy(y.begin(), y.end(), yinp.begin());
+    const T* x_data = x.data();
+    const T* y_data = y.data();
+    T* zref_data = zref.data();
+    T* xinp_data = xinp.data();
+    T* yinp_data = yinp.data();
+    // test refer code inplace
+    ref(x_data, y_data, zref_data, d);
+    ref(x_data, yinp_data, yinp_data, d);
+    ref(xinp_data, y_data, xinp_data, d);
+    ExpectEQ<T>(xinp_data, zref_data, d);
+    ExpectEQ<T>(yinp_data, zref_data, d);
+    auto verifier = [](const typename KernelTuple::func_type tgt,
                       const std::vector<T>& x, const std::vector<T>& y,
                       const std::vector<T>& zref) {
      EXPECT_TRUE(tgt != nullptr);
@@ -98,13 +129,35 @@ struct TestFuncWithRefer<jit::XYZNTuples<T>, std::vector<T>, std::vector<T>,
      std::copy(y.begin(), y.end(), ztgt.begin());
      tgt(x_data, ztgt_data, ztgt_data, d);
      ExpectEQ<T>(ztgt_data, zref_data, d);
+    };
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, y, zref);
  }
-};
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
+void TestKernelAXYN() {
-                         std::vector<T>> {
+  using T = typename KernelTuple::data_type;
-  void operator()(const typename jit::AXYNTuples<T>::func_type tgt, const T a,
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  for (int d : TestSizes()) {
+    auto ref = jit::GetReferFunc<KernelTuple>();
+    EXPECT_TRUE(ref != nullptr);
+    const T a = static_cast<T>(3);
+    std::vector<T> x(d), yref(d);
+    std::vector<T> xinp(d);  // inplace test
+    RandomVec<T>(d, x.data());
+    std::copy(x.begin(), x.end(), xinp.begin());
+    const T* x_data = x.data();
+    T* yref_data = yref.data();
+    T* xinp_data = xinp.data();
+    // test refer code inplace
+    ref(&a, x_data, yref_data, d);
+    ref(&a, xinp_data, xinp_data, d);
+    ExpectEQ<T>(xinp_data, yref_data, d);
+    auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
                       const std::vector<T>& x, const std::vector<T>& yref) {
      EXPECT_TRUE(tgt != nullptr);
      EXPECT_EQ(yref.size(), x.size());
@@ -120,66 +173,32 @@ struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
      std::copy(x.begin(), x.end(), ytgt.begin());
      tgt(&a, ytgt_data, ytgt_data, d);
      ExpectEQ<T>(ytgt_data, yref_data, d);
+    };
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref);
  }
-};
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::SoftmaxTuples<T>, std::vector<T>, std::vector<T>,
+void TestKernelXYN() {
-                         int, int> {
+  using T = typename KernelTuple::data_type;
-  void operator()(const typename jit::SoftmaxTuples<T>::func_type tgt,
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-                  const std::vector<T>& x, const std::vector<T>& yref, int n,
+  for (int d : TestSizes()) {
-                  int bs) {
+    auto ref = jit::GetReferFunc<KernelTuple>();
-    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_TRUE(ref != nullptr);
-    EXPECT_EQ(yref.size(), x.size());
-    EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
-    const T* x_data = x.data();
-    const T* yref_data = yref.data();
-    std::vector<T> ytgt(n * bs);
-    T* ytgt_data = ytgt.data();
-    // test normal
-    tgt(x_data, ytgt_data, n, bs);
-    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-    // test inplace x
-    std::copy(x.begin(), x.end(), ytgt.begin());
-    tgt(ytgt_data, ytgt_data, n, bs);
-    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-  }
-};
-template <typename T>
+    std::vector<T> x(d), yref(d);
-struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
+    std::vector<T> xinp(d);  // inplace test
-  void operator()(const typename jit::XRNTuples<T>::func_type tgt,
+    RandomVec<T>(d, x.data());
-                  const std::vector<T>& x, const T ref_res) {
+    std::copy(x.begin(), x.end(), xinp.begin());
-    EXPECT_TRUE(tgt != nullptr);
-    T tgt_res;
-    tgt(x.data(), &tgt_res, x.size());
-    ExpectEQ<T>(&tgt_res, &ref_res, 1);
-  }
-};
-template <typename T>
-struct TestFuncWithRefer<jit::VBroadcastTuples<T>, std::vector<T>,
-                         std::vector<T>, int64_t,
-                         typename jit::VBroadcastTuples<T>::attr_type> {
-  void operator()(const typename jit::VBroadcastTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const std::vector<T>& yref,
-                  int64_t h,
-                  const typename jit::VBroadcastTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size(), static_cast<size_t>(attr));
-    EXPECT_EQ(yref.size(), x.size() * h);
-    std::vector<T> y(yref.size());
    const T* x_data = x.data();
-    const T* yref_data = yref.data();
+    T* yref_data = yref.data();
-    T* y_data = y.data();
+    T* xinp_data = xinp.data();
-    tgt(x_data, y_data, h, attr);
+    // test refer code inplace
-    ExpectEQ<T>(y_data, yref_data, yref.size());
+    ref(x_data, yref_data, d);
-  }
+    ref(xinp_data, xinp_data, d);
-};
+    ExpectEQ<T>(xinp_data, yref_data, d);
+    auto verifier = [](const typename KernelTuple::func_type tgt,
-template <typename T>
-struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
-  void operator()(const typename jit::XYNTuples<T>::func_type tgt,
                       const std::vector<T>& x, const std::vector<T>& yref) {
      EXPECT_TRUE(tgt != nullptr);
      EXPECT_EQ(yref.size(), x.size());
@@ -195,18 +214,86 @@ struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
      std::copy(x.begin(), x.end(), ytgt.begin());
      tgt(ytgt_data, ytgt_data, d);
      ExpectEQ<T>(ytgt_data, yref_data, d);
+    };
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, yref);
  }
-};
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
+void TestKernelXRN() {
-                         std::vector<T>, std::vector<T>, std::vector<T>,
+  using T = typename KernelTuple::data_type;
-                         typename jit::LSTMTuples<T>::attr_type> {
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  void operator()(const typename jit::LSTMTuples<T>::func_type tgt,
+  auto last_acc = FLAGS_acc;
+  FLAGS_acc = 1e-4;
+  for (int d : TestSizes()) {
+    auto ref = jit::GetReferFunc<KernelTuple>();
+    EXPECT_TRUE(ref != nullptr);
+    std::vector<T> x(d);
+    RandomVec<T>(d, x.data());
+    T ref_res;
+    ref(x.data(), &ref_res, d);
+    auto verifier = [](const typename KernelTuple::func_type tgt,
+                       const std::vector<T>& x, const T ref_res) {
+      EXPECT_TRUE(tgt != nullptr);
+      T tgt_res;
+      tgt(x.data(), &tgt_res, x.size());
+      ExpectEQ<T>(&tgt_res, &ref_res, 1);
+    };
+    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res);
+  }
+  FLAGS_acc = last_acc;
+}
+template <typename KernelTuple, typename PlaceType>
+void TestKernelLSTM() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  for (int d : test_sizes) {
+    for (bool use_peephole : {true, false}) {
+      for (auto& act_gate : all_acts) {
+        for (auto& act_cand : all_acts) {
+          for (auto& act_cell : all_acts) {
+            const jit::lstm_attr_t attr(
+                d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
+                jit::to_kerneltype(act_cell), use_peephole);
+            auto ref = jit::GetReferFunc<KernelTuple>();
+            EXPECT_TRUE(ref != nullptr);
+            std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
+            std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
+            RandomVec<T>(4 * d, xsrc.data());
+            RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
+            RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
+            // x could be changed after compute, so copy to save src
+            std::vector<T> x(xsrc.size());
+            std::copy(xsrc.begin(), xsrc.end(), x.begin());
+            const T* ct_1_data = ct_1.data();
+            const T* wp_data = wp.data();
+            T* x_data = x.data();
+            T* checked_data = checked.data();
+            T* ct_ref_data = ct_ref.data();
+            T* ht_ref_data = ht_ref.data();
+            jit::lstm_t step;
+            step.gates = x_data;
+            step.ct_1 = ct_1_data;
+            step.ct = ct_ref_data;
+            step.ht = ht_ref_data;
+            if (use_peephole) {
+              step.wp = wp_data;
+              step.checked = checked_data;
+            }
+            ref(&step, &attr);
+            VLOG(10) << attr;
+            auto verifier = [](
+                const typename KernelTuple::func_type tgt,
                const std::vector<T>& xsrc, const std::vector<T>& wp,
                const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
                const std::vector<T>& ht_ref,
-                  const typename jit::LSTMTuples<T>::attr_type& attr) {
+                const typename KernelTuple::attr_type& attr) {
              EXPECT_TRUE(tgt != nullptr);
              EXPECT_EQ(ct_ref.size(), ht_ref.size());
              EXPECT_EQ(ct_1.size(), ht_ref.size());
@@ -215,7 +302,8 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
              // x could be changed after compute, so copy to save src
              int d = ht_ref.size();
-    std::vector<T> x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size());
+              std::vector<T> x(xsrc.size()), ct(ct_ref.size()),
+                  ht(ht_ref.size());
              std::vector<T> checked(2 * d);
              std::copy(xsrc.begin(), xsrc.end(), x.begin());
@@ -241,17 +329,50 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
              tgt(&step, &attr);
              ExpectEQ<T>(ct_data, ct_ref_data, d);
              ExpectEQ<T>(ht_data, ht_ref_data, d);
+            };
+            TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, wp, ct_1,
+                                                 ct_ref, ht_ref, attr);
+          }
        }
-};
+      }
+    }
+  }
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
+void TestKernelGRU() {
-                         std::vector<T>,
+  using T = typename KernelTuple::data_type;
-                         typename jit::GRUTuples<T>::attr_type> {
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  void operator()(const typename jit::GRUTuples<T>::func_type tgt,
+  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
-                  const std::vector<T>& xsrc, const std::vector<T>& ht_1,
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  for (int d : test_sizes) {
+    for (auto& act_gate : all_acts) {
+      for (auto& act_cand : all_acts) {
+        const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
+                                   jit::to_kerneltype(act_cand));
+        auto ref = jit::GetReferFunc<KernelTuple>();
+        EXPECT_TRUE(ref != nullptr);
+        std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
+        RandomVec<T>(3 * d, xsrc.data());
+        RandomVec<T>(d, ht_1.data());
+        // x could be changed after compute, so copy to save src
+        std::vector<T> x(xsrc.size());
+        std::copy(xsrc.begin(), xsrc.end(), x.begin());
+        const T* ht_1_data = ht_1.data();
+        T* x_data = x.data();
+        T* ht_ref_data = ht_ref.data();
+        jit::gru_t step;
+        step.gates = x_data;
+        step.ht_1 = ht_1_data;
+        step.ht = ht_ref_data;
+        ref(&step, &attr);
+        VLOG(10) << attr;
+        auto verifier = [](const typename KernelTuple::func_type tgt,
+                           const std::vector<T>& xsrc,
+                           const std::vector<T>& ht_1,
                           const std::vector<T>& ht_ref,
-                  const typename jit::GRUTuples<T>::attr_type& attr) {
+                           const typename KernelTuple::attr_type& attr) {
          EXPECT_TRUE(tgt != nullptr);
          EXPECT_EQ(ht_1.size(), ht_ref.size());
          EXPECT_EQ(xsrc.size(), 3 * ht_ref.size());
@@ -270,131 +391,125 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
          step.ht = ht_data;
          tgt(&step, &attr);
          ExpectEQ<T>(ht_data, ht_ref_data, d);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, ht_1, ht_ref,
+                                             attr);
      }
-};
-template <typename T>
-struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
-                         typename jit::SeqPoolTuples<T>::attr_type> {
-  void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
-                  const std::vector<T>& x, const std::vector<T>& yref,
-                  const typename jit::SeqPoolTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
-    int w = yref.size();
-    std::vector<T> y(w);
-    const T* x_data = x.data();
-    const T* yref_data = yref.data();
-    T* y_data = y.data();
-    tgt(x_data, y_data, &attr);
-    ExpectEQ<T>(y_data, yref_data, w);
    }
-};
-template <typename T>
-struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
-                         std::vector<int64_t>, std::vector<T>,
-                         typename jit::EmbSeqPoolTuples<T>::attr_type> {
-  void operator()(const typename jit::EmbSeqPoolTuples<T>::func_type tgt,
-                  const std::vector<T>& table, const std::vector<int64_t>& idx,
-                  const std::vector<T>& oref,
-                  const typename jit::EmbSeqPoolTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(table.size(),
-              static_cast<size_t>(attr.table_height * attr.table_width));
-    EXPECT_EQ(idx.size(),
-              static_cast<size_t>(attr.index_height * attr.index_width));
-    EXPECT_EQ(oref.size(),
-              static_cast<size_t>(attr.table_width * attr.index_width));
-    const T* table_data = table.data();
-    const int64_t* idx_data = idx.data();
-    const T* oref_data = oref.data();
-    int o_w = oref.size();
-    std::vector<T> out(o_w);
-    T* o_data = out.data();
-    tgt(table_data, idx_data, o_data, &attr);
-    ExpectEQ<T>(o_data, oref_data, o_w);
  }
-};
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::SgdTuples<T>, T, std::vector<T>, std::vector<T>,
+void TestKernelNCHW16CMulNC() {
-                         std::vector<int64_t>, std::vector<T>,
+  using T = typename KernelTuple::data_type;
-                         typename jit::SgdTuples<T>::attr_type> {
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  void operator()(const typename jit::SgdTuples<T>::func_type tgt, const T lr,
+  const int n = 3, c = 16 * 4, h = 10, w = 10;
-                  const std::vector<T>& param, const std::vector<T>& grad,
+  auto ref = jit::GetReferFunc<KernelTuple>();
-                  const std::vector<int64_t>& rows, const std::vector<T>& oref,
+  EXPECT_TRUE(ref != nullptr);
-                  const typename jit::SgdTuples<T>::attr_type& attr) {
+  int sz = n * c * h * w;
+  std::vector<T> x(sz), y(n * c), zref(sz);
+  std::vector<T> ztgt(sz), zjit(sz);
+  RandomVec<T>(sz, x.data());
+  RandomVec<T>(n * c, y.data());
+  const T* x_data = x.data();
+  const T* y_data = y.data();
+  T* zref_data = zref.data();
+  T* ztgt_data = ztgt.data();
+  T* zjit_data = zjit.data();
+  constexpr int simd_width = ZMM_FLOAT_BLOCK;
+  int C = c / simd_width;
+  auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(0);
+  auto funcs = jit::GetAllCandidateFuncs<KernelTuple, PlaceType>(0);
+  EXPECT_GT(funcs.size(), 0UL);
+  auto jitcode = funcs[0];
  EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(param.size(),
-              static_cast<size_t>(attr.param_height * attr.param_width));
-    EXPECT_EQ(grad.size(),
-              static_cast<size_t>(attr.grad_height * attr.grad_width));
-    EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
-    EXPECT_EQ(param.size(), oref.size());
-    const T* param_data = param.data();
-    const T* grad_data = grad.data();
-    const int64_t* rows_data = rows.data();
-    const T* oref_data = oref.data();
-    std::vector<T> out(oref.size());
+  if (std::is_same<T, float>::value &&
-    T* o_data = out.data();
+      paddle::platform::MayIUse(paddle::platform::avx512f)) {
-    tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
+    EXPECT_TRUE(jitcode != nullptr);
-    // only the selected rows should be equal
-    for (size_t i = 0; i < rows.size(); ++i) {
-      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
-                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
  }
+  for (int ni = 0; ni < n; ni++) {
+    for (int ci = 0; ci < C; ci++) {
+      auto ptr_x =
+          x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+      auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
+      auto ptr_zref =
+          zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+      auto ptr_ztgt =
+          ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-    // inplace
+      ref(ptr_x, ptr_y, ptr_zref, h, w);
-    std::copy(param.begin(), param.end(), out.begin());
+      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
-    tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
-    for (size_t i = 0; i < rows.size(); ++i) {
+      if (jitcode) {
-      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
+        auto ptr_zjit =
-                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
+            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
      }
    }
-};
-template <typename T>
-struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>,
-                         typename jit::MatMulTuples<T>::attr_type> {
-  void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
-                  const std::vector<T>& a, const std::vector<T>& b,
-                  const std::vector<T>& cref,
-                  const typename jit::MatMulTuples<T>::attr_type& attr) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
-    EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
-    EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
-    std::vector<T> c(cref.size());
-    const T* a_data = a.data();
-    const T* b_data = b.data();
-    const T* cref_data = cref.data();
-    T* c_data = c.data();
-    tgt(a_data, b_data, c_data, &attr);
-    ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
  }
-};
+  ExpectEQ<T>(ztgt_data, zref_data, sz);
+  if (jitcode) {
+    ExpectEQ<T>(zjit_data, zref_data, sz);
+  }
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
+void TestKernelLayerNorm() {
-                         std::vector<T>, std::vector<T>, std::vector<T>,
+  using T = typename KernelTuple::data_type;
-                         std::vector<T>, std::vector<T>, int, float, int> {
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
+  const T epsilon = 9.99999975e-06;
-                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
+  for (int n : {1, 2, 10}) {
-                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
+    for (int x_dim_0 : {1, 9, 17, 50}) {
-                  const std::vector<T>& scale, const std::vector<T>& bias,
+      int left = n * x_dim_0;
-                  int left, const float epsilon, int right) {
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        auto ref = jit::GetReferFunc<KernelTuple>();
+        EXPECT_TRUE(ref != nullptr);
+        int sz = left * right;
+        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
+            outref(sz);
+        RandomVec<T>(sz, x.data());
+        RandomVec<T>(left, mean.data());
+        RandomVec<T>(left, var.data());
+        RandomVec<T>(right, scale.data());
+        RandomVec<T>(right, bias.data());
+        const T* scale_data = scale.data();
+        const T* bias_data = bias.data();
+        T* x_data = x.data();
+        T* mean_data = mean.data();
+        T* var_data = var.data();
+        T* outref_data = outref.data();
+        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
+            left, epsilon, right);
+        auto verifier = [](
+            const typename KernelTuple::func_type tgt, const std::vector<T>& x_,
+            const std::vector<T>& outref_, const std::vector<T>& mean_,
+            const std::vector<T>& var_, const std::vector<T>& scale,
+            const std::vector<T>& bias, const int& left, const float& epsilon,
+            const typename KernelTuple::attr_type& right) {
          EXPECT_TRUE(tgt != nullptr);
+          std::vector<T> outtgt(outref_.size());
+          std::vector<T> x(x_.size());
+          std::vector<T> mean(mean_.size());
+          std::vector<T> var(var_.size());
+          std::vector<T> outref(outref_.size());
+          std::copy(x_.begin(), x_.end(), x.begin());
+          std::copy(mean_.begin(), mean_.end(), mean.begin());
+          std::copy(var_.begin(), var_.end(), var.begin());
+          std::copy(outref_.begin(), outref_.end(), outref.begin());
          EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
          EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
          EXPECT_EQ(mean.size(), static_cast<size_t>(left));
          EXPECT_EQ(var.size(), static_cast<size_t>(left));
          EXPECT_EQ(scale.size(), static_cast<size_t>(right));
          EXPECT_EQ(bias.size(), static_cast<size_t>(right));
-    std::vector<T> outtgt(outref.size());
          const T* scale_data = scale.data();
          const T* bias_data = bias.data();
          T* x_data = x.data();
@@ -402,263 +517,69 @@ struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
          T* var_data = var.data();
          T* outref_data = outref.data();
          T* outtgt_data = outtgt.data();
+          tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data,
-    tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
+              left, epsilon, right);
-        epsilon, right);
          ExpectEQ<T>(outtgt_data, outref_data, left * right);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(right, verifier, x, outref, mean,
+                                             var, scale, bias, left, epsilon,
+                                             right);
      }
-};
+    }
+  }
+}
-template <typename T>
+template <typename KernelTuple, typename PlaceType>
-struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
+void TestKernelCRFDecoding() {
-                         std::vector<T>, std::vector<T>, std::vector<int>,
+  using T = typename KernelTuple::data_type;
-                         int> {
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
-  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
+  constexpr int state_trans_base_idx = 2;
-                  const int seq_len, const std::vector<T>& x,
+  auto test_sizes = TestSizes();
-                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
-                  std::vector<int>& trackref, int tag_num) {          // NOLINT
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : test_sizes) {
+      auto ref = jit::GetReferFunc<KernelTuple>();
+      EXPECT_TRUE(ref != nullptr);
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
+      std::vector<int> trackref(x_sz);
+      RandomVec<T>(x_sz, x.data());
+      RandomVec<T>(w_sz, w.data());
+      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
+          trackref.data(), tag_num);
+      auto verifier = [](
+          const typename KernelTuple::func_type tgt, const int& seq_len,
+          const std::vector<T>& x, const std::vector<T>& w,
+          const std::vector<T>& alpharef, const std::vector<int>& trackref,
+          const typename KernelTuple::attr_type& tag_num) {
        constexpr int state_trans_base_idx = 2;
        EXPECT_TRUE(tgt != nullptr);
        EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
-    EXPECT_EQ(w.size(),
+        EXPECT_EQ(w.size(), static_cast<size_t>(
-              static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
+                                (tag_num + state_trans_base_idx) * tag_num));
        EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
        EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
        std::vector<T> alphatgt(alpharef.size());
        std::vector<int> tracktgt(trackref.size());
+        memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int));
-    memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
        tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
            tracktgt.data(), tag_num);
        ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
        ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
-  }
+      };
-};
+      TestAllImpls<KernelTuple, PlaceType>(tag_num, verifier, seq_len, x, w,
+                                           alpharef, trackref, tag_num);
-template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
-          typename... Args>
-void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
-  TestFuncWithRefer<KernelTuples, Args...> test;
-  // test jitcode
-  auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
-  if (jitcode) {
-    VLOG(10) << "Test Jitcode Kernel ";
-    test(jitcode, args...);
-  }
-  // test all impls in more
-  jit::KernelKey kkey(KT, PlaceType());
-  auto& pool = jit::KernelPool().Instance().AllKernels();
-  auto iter = pool.find(kkey);
-  if (iter != pool.end()) {
-    auto& impls = iter->second;
-    for (auto& impl : impls) {
-      auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
-      if (i && i->UseMe(attr)) {
-        auto more = i->GetFunc();
-        VLOG(10) << "Test More Kernel : " << i->ImplType();
-        test(more, args...);
-      }
-    }
-  }
-  // test result from Get function
-  // VLOG(10) << "Test Get function ";
-  auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
-  test(tgt, args...);
-}
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelXYZNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
-    EXPECT_TRUE(ref != nullptr);
-    std::vector<T> x(d), y(d), zref(d);
-    RandomVec<T>(d, x.data());
-    RandomVec<T>(d, y.data());
-    std::vector<T> xinp(d), yinp(d);  // inplace test
-    std::copy(x.begin(), x.end(), xinp.begin());
-    std::copy(y.begin(), y.end(), yinp.begin());
-    const T* x_data = x.data();
-    const T* y_data = y.data();
-    T* zref_data = zref.data();
-    T* xinp_data = xinp.data();
-    T* yinp_data = yinp.data();
-    // test refer code inplace
-    ref(x_data, y_data, zref_data, d);
-    ref(x_data, yinp_data, yinp_data, d);
-    ref(xinp_data, y_data, xinp_data, d);
-    ExpectEQ<T>(xinp_data, zref_data, d);
-    ExpectEQ<T>(yinp_data, zref_data, d);
-    TestAllImpls<KT, jit::XYZNTuples<T>, PlaceType, std::vector<T>,
-                 std::vector<T>, std::vector<T>>(d, x, y, zref);
-  }
-}
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelAXYNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
-    EXPECT_TRUE(ref != nullptr);
-    const T a = static_cast<T>(3);
-    std::vector<T> x(d), yref(d);
-    std::vector<T> xinp(d);  // inplace test
-    RandomVec<T>(d, x.data());
-    std::copy(x.begin(), x.end(), xinp.begin());
-    const T* x_data = x.data();
-    T* yref_data = yref.data();
-    T* xinp_data = xinp.data();
-    // test refer code inplace
-    ref(&a, x_data, yref_data, d);
-    ref(&a, xinp_data, xinp_data, d);
-    ExpectEQ<T>(xinp_data, yref_data, d);
-    TestAllImpls<KT, jit::AXYNTuples<T>, PlaceType, T, std::vector<T>,
-                 std::vector<T>>(d, a, x, yref);
-  }
-}
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelXRNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = FLAGS_acc;
-  FLAGS_acc = 1e-4;
-  for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
-    EXPECT_TRUE(ref != nullptr);
-    std::vector<T> x(d);
-    RandomVec<T>(d, x.data());
-    T ref_res;
-    ref(x.data(), &ref_res, d);
-    TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
-                                                                      ref_res);
-  }
-  FLAGS_acc = last_acc;
-}
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelXYNTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  for (int d : TestSizes()) {
-    auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
-    EXPECT_TRUE(ref != nullptr);
-    std::vector<T> x(d), yref(d);
-    std::vector<T> xinp(d);  // inplace test
-    RandomVec<T>(d, x.data());
-    std::copy(x.begin(), x.end(), xinp.begin());
-    const T* x_data = x.data();
-    T* yref_data = yref.data();
-    T* xinp_data = xinp.data();
-    // test refer code inplace
-    ref(x_data, yref_data, d);
-    ref(xinp_data, xinp_data, d);
-    ExpectEQ<T>(xinp_data, yref_data, d);
-    TestAllImpls<KT, jit::XYNTuples<T>, PlaceType, std::vector<T>,
-                 std::vector<T>>(d, x, yref);
-  }
-}
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelLSTMTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
-  for (int d : test_sizes) {
-    for (bool use_peephole : {true, false}) {
-      for (auto& act_gate : all_acts) {
-        for (auto& act_cand : all_acts) {
-          for (auto& act_cell : all_acts) {
-            const jit::lstm_attr_t attr(
-                d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
-                jit::to_kerneltype(act_cell), use_peephole);
-            auto ref = jit::GetRefer<KT, jit::LSTMTuples<T>>();
-            EXPECT_TRUE(ref != nullptr);
-            std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
-            std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
-            RandomVec<T>(4 * d, xsrc.data());
-            RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
-            RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
-            // x could be changed after compute, so copy to save src
-            std::vector<T> x(xsrc.size());
-            std::copy(xsrc.begin(), xsrc.end(), x.begin());
-            const T* ct_1_data = ct_1.data();
-            const T* wp_data = wp.data();
-            T* x_data = x.data();
-            T* checked_data = checked.data();
-            T* ct_ref_data = ct_ref.data();
-            T* ht_ref_data = ht_ref.data();
-            jit::lstm_t step;
-            step.gates = x_data;
-            step.ct_1 = ct_1_data;
-            step.ct = ct_ref_data;
-            step.ht = ht_ref_data;
-            if (use_peephole) {
-              step.wp = wp_data;
-              step.checked = checked_data;
-            }
-            ref(&step, &attr);
-            VLOG(10) << attr;
-            TestAllImpls<KT, jit::LSTMTuples<T>, PlaceType, std::vector<T>,
-                         std::vector<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>>(attr, xsrc, wp, ct_1, ct_ref, ht_ref,
-                                         attr);
-          }
-        }
-      }
-    }
-  }
-}
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelGRUTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
-  for (int d : test_sizes) {
-    for (auto& act_gate : all_acts) {
-      for (auto& act_cand : all_acts) {
-        const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
-                                   jit::to_kerneltype(act_cand));
-        auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
-        EXPECT_TRUE(ref != nullptr);
-        std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
-        RandomVec<T>(3 * d, xsrc.data());
-        RandomVec<T>(d, ht_1.data());
-        // x could be changed after compute, so copy to save src
-        std::vector<T> x(xsrc.size());
-        std::copy(xsrc.begin(), xsrc.end(), x.begin());
-        const T* ht_1_data = ht_1.data();
-        T* x_data = x.data();
-        T* ht_ref_data = ht_ref.data();
-        jit::gru_t step;
-        step.gates = x_data;
-        step.ht_1 = ht_1_data;
-        step.ht = ht_ref_data;
-        ref(&step, &attr);
-        VLOG(10) << attr;
-        TestAllImpls<KT, jit::GRUTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>, std::vector<T>>(attr, xsrc, ht_1, ht_ref,
-                                                     attr);
-      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void TestKernelSeqPoolTuples() {
+void TestKernelSeqPool() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
  auto test_sizes = TestSizes();
@@ -668,7 +589,7 @@ void TestKernelSeqPoolTuples() {
      jit::seq_pool_attr_t attr(w, type);
      for (int h : test_sizes) {
        attr.h = h;
-        auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
+        auto ref = jit::GetReferFunc<KernelTuple>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> x(h * w), yref(w);
        RandomVec<T>(h * w, x.data());
@@ -676,16 +597,86 @@ void TestKernelSeqPoolTuples() {
        T* yref_data = yref.data();
        ref(x_data, yref_data, &attr);
        VLOG(10) << attr;
-        TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
+        auto verifier = [](const typename KernelTuple::func_type tgt,
-                     std::vector<T>>(attr, x, yref, attr);
+                           const std::vector<T>& x, const std::vector<T>& yref,
+                           const typename KernelTuple::attr_type& attr) {
+          EXPECT_TRUE(tgt != nullptr);
+          EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
+          int w = yref.size();
+          std::vector<T> y(w);
+          const T* x_data = x.data();
+          const T* yref_data = yref.data();
+          T* y_data = y.data();
+          tgt(x_data, y_data, &attr);
+          ExpectEQ<T>(y_data, yref_data, w);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, x, yref, attr);
+      }
+    }
+  }
+}
+template <typename KernelTuple, typename PlaceType>
+void TestKernelEmbSeqPool() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  int64_t tbl_h = 1e4;
+  std::vector<jit::SeqPoolType> pool_types = {
+      jit::SeqPoolType::kSum};  // only support sum yet
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  for (int tbl_w : test_sizes) {
+    std::vector<T> table(tbl_h * tbl_w);
+    RandomVec<T>(tbl_h * tbl_w, table.data());
+    const T* table_data = table.data();
+    for (auto type : pool_types) {
+      for (int idx_w : {1, 2, 10, 16}) {
+        for (int idx_h : {1, 2, 9, 13, 16}) {
+          auto ref = jit::GetReferFunc<KernelTuple>();
+          EXPECT_TRUE(ref != nullptr);
+          std::vector<int64_t> idx(idx_h * idx_w);
+          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
+          int64_t out_w = tbl_w * idx_w;
+          std::vector<T> oref(out_w);
+          const int64_t* idx_data = idx.data();
+          T* o_data = oref.data();
+          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
+                                        type);
+          ref(table_data, idx_data, o_data, &attr);
+          auto verifier = [](const typename KernelTuple::func_type tgt,
+                             const std::vector<T>& table,
+                             const std::vector<int64_t>& idx,
+                             const std::vector<T>& oref,
+                             const typename KernelTuple::attr_type& attr) {
+            EXPECT_TRUE(tgt != nullptr);
+            EXPECT_EQ(table.size(), static_cast<size_t>(attr.table_height *
+                                                        attr.table_width));
+            EXPECT_EQ(idx.size(), static_cast<size_t>(attr.index_height *
+                                                      attr.index_width));
+            EXPECT_EQ(oref.size(),
+                      static_cast<size_t>(attr.table_width * attr.index_width));
+            const T* table_data = table.data();
+            const int64_t* idx_data = idx.data();
+            const T* oref_data = oref.data();
+            int o_w = oref.size();
+            std::vector<T> out(o_w);
+            T* o_data = out.data();
+            tgt(table_data, idx_data, o_data, &attr);
+            ExpectEQ<T>(o_data, oref_data, o_w);
+          };
+          TestAllImpls<KernelTuple, PlaceType>(attr, verifier, table, idx, oref,
+                                               attr);
+        }
      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void TestKernelMatMulTuples() {
+void TestKernelMatMul() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  auto last_acc = FLAGS_acc;
  // export MKL_CBWR=AVX would make MKL force to use AVX
  // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic
@@ -693,7 +684,7 @@ void TestKernelMatMulTuples() {
  for (int m : {1, 2, 3, 4}) {
    for (int n : {1, 2, 3, 4}) {
      for (int k : TestSizes()) {
-        auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
+        auto ref = jit::GetReferFunc<KernelTuple>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> a(m * k), b(k * n), c(m * n);
        RandomVec<T>(m * k, a.data());
@@ -703,20 +694,36 @@ void TestKernelMatMulTuples() {
        T* c_data = c.data();
        const jit::matmul_attr_t attr{m, n, k};
        ref(a_data, b_data, c_data, &attr);
-        TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
+        auto verifier = [](const typename KernelTuple::func_type tgt,
-                     std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
+                           const std::vector<T>& a, const std::vector<T>& b,
+                           const std::vector<T>& cref,
+                           const typename KernelTuple::attr_type& attr) {
+          EXPECT_TRUE(tgt != nullptr);
+          EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
+          EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
+          EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
+          std::vector<T> c(cref.size());
+          const T* a_data = a.data();
+          const T* b_data = b.data();
+          const T* cref_data = cref.data();
+          T* c_data = c.data();
+          tgt(a_data, b_data, c_data, &attr);
+          ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, a, b, c, attr);
      }
    }
  }
  FLAGS_acc = last_acc;
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void TestKernelSoftmaxTuples() {
+void TestKernelSoftmax() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int bs : {1, 2, 10}) {
    for (int n : TestSizes()) {
-      auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
+      auto ref = jit::GetReferFunc<KernelTuple>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(bs * n), y(bs * n);
      RandomVec<T>(bs * n, x.data());
@@ -730,51 +737,33 @@ void TestKernelSoftmaxTuples() {
      ref(xinp_data, xinp_data, n, bs);
      ExpectEQ<T>(xinp_data, y_data, n * bs);
-      TestAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType, std::vector<T>,
+      auto verifier = [](const typename KernelTuple::func_type tgt,
-                   std::vector<T>>(n, x, y, n, bs);
+                         const std::vector<T>& x, const std::vector<T>& yref,
-    }
+                         int n, int bs) {
-  }
+        EXPECT_TRUE(tgt != nullptr);
-}
+        EXPECT_EQ(yref.size(), x.size());
+        EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
-template <jit::KernelType KT, typename T, typename PlaceType>
+        const T* x_data = x.data();
-void TestKernelEmbSeqPoolTuples() {
+        const T* yref_data = yref.data();
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+        std::vector<T> ytgt(n * bs);
-  int64_t tbl_h = 1e4;
+        T* ytgt_data = ytgt.data();
-  std::vector<jit::SeqPoolType> pool_types = {
+        // test normal
-      jit::SeqPoolType::kSum};  // only support sum yet
+        tgt(x_data, ytgt_data, n, bs);
-  auto test_sizes = TestSizes();
+        ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+        // test inplace x
-  for (int tbl_w : test_sizes) {
+        std::copy(x.begin(), x.end(), ytgt.begin());
-    std::vector<T> table(tbl_h * tbl_w);
+        tgt(ytgt_data, ytgt_data, n, bs);
-    RandomVec<T>(tbl_h * tbl_w, table.data());
+        ExpectEQ<T>(ytgt_data, yref_data, n * bs);
-    const T* table_data = table.data();
+      };
-    for (auto type : pool_types) {
+      TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs);
-      for (int idx_w : {1, 2, 10, 16}) {
-        for (int idx_h : {1, 2, 9, 13, 16}) {
-          auto ref = jit::GetRefer<KT, jit::EmbSeqPoolTuples<T>>();
-          EXPECT_TRUE(ref != nullptr);
-          std::vector<int64_t> idx(idx_h * idx_w);
-          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
-          int64_t out_w = tbl_w * idx_w;
-          std::vector<T> oref(out_w);
-          const int64_t* idx_data = idx.data();
-          T* o_data = oref.data();
-          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
-                                        type);
-          ref(table_data, idx_data, o_data, &attr);
-          TestAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType, std::vector<T>,
-                       std::vector<int64_t>, std::vector<T>>(attr, table, idx,
-                                                             oref, attr);
-        }
-      }
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void TestKernelSgdTuples() {
+void TestKernelSgd() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
@@ -802,7 +791,7 @@ void TestKernelSgdTuples() {
        RandomVec<T>(rows_size * grad_w, grad.data());
        const int64_t* rows_data = rows.data();
        const T* grad_data = grad.data();
-        auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
+        auto ref = jit::GetReferFunc<KernelTuple>();
        EXPECT_TRUE(ref != nullptr);
        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
        ref(&lr, param_data, grad_data, rows_data, out_data, &attr);
@@ -818,227 +807,488 @@ void TestKernelSgdTuples() {
                      grad_w);
        }
-        TestAllImpls<KT, jit::SgdTuples<T>, PlaceType, T, std::vector<T>,
+        auto verifier = [](
-                     std::vector<T>, std::vector<int64_t>, std::vector<T>>(
+            const typename KernelTuple::func_type tgt, const T lr,
-            attr, lr, param, grad, rows, param_out, attr);
+            const std::vector<T>& param, const std::vector<T>& grad,
-      }
+            const std::vector<int64_t>& rows, const std::vector<T>& oref,
-    }
+            const typename KernelTuple::attr_type& attr) {
-  }
-}
-template <jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelNCHW16CMulNCTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  const int n = 3, c = 16 * 4, h = 10, w = 10;
-  auto ref = jit::GetRefer<KT, jit::NCHW16CMulNCTuples<T>>();
-  EXPECT_TRUE(ref != nullptr);
-  int sz = n * c * h * w;
-  std::vector<T> x(sz), y(n * c), zref(sz);
-  std::vector<T> ztgt(sz), zjit(sz);
-  RandomVec<T>(sz, x.data());
-  RandomVec<T>(n * c, y.data());
-  const T* x_data = x.data();
-  const T* y_data = y.data();
-  T* zref_data = zref.data();
-  T* ztgt_data = ztgt.data();
-  T* zjit_data = zjit.data();
-  constexpr int simd_width = ZMM_FLOAT_BLOCK;
-  int C = c / simd_width;
-  auto tgt = jit::Get<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
-  auto jitcode = jit::GetJitCode<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
          EXPECT_TRUE(tgt != nullptr);
+          EXPECT_EQ(param.size(),
+                    static_cast<size_t>(attr.param_height * attr.param_width));
+          EXPECT_EQ(grad.size(),
+                    static_cast<size_t>(attr.grad_height * attr.grad_width));
+          EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
+          EXPECT_EQ(param.size(), oref.size());
+          const T* param_data = param.data();
+          const T* grad_data = grad.data();
+          const int64_t* rows_data = rows.data();
+          const T* oref_data = oref.data();
-  if (std::is_same<T, float>::value &&
+          std::vector<T> out(oref.size());
-      paddle::platform::MayIUse(paddle::platform::avx512f)) {
+          T* o_data = out.data();
-    EXPECT_TRUE(jitcode != nullptr);
+          tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
-  }
+          // only the selected rows should be equal
-  for (int ni = 0; ni < n; ni++) {
+          for (size_t i = 0; i < rows.size(); ++i) {
-    for (int ci = 0; ci < C; ci++) {
+            ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
-      auto ptr_x =
+                        oref_data + rows[i] * attr.grad_width, attr.grad_width);
-          x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-      auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
-      auto ptr_zref =
-          zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-      auto ptr_ztgt =
-          ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-      ref(ptr_x, ptr_y, ptr_zref, h, w);
-      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
-      if (jitcode) {
-        auto ptr_zjit =
-            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
-        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
-      }
-    }
-  }
-  ExpectEQ<T>(ztgt_data, zref_data, sz);
-  if (jitcode) {
-    ExpectEQ<T>(zjit_data, zref_data, sz);
          }
-}
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelLayerNormTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  const T epsilon = 9.99999975e-06;
-  for (int n : {1, 2, 10}) {
-    for (int x_dim_0 : {1, 9, 17, 50}) {
-      int left = n * x_dim_0;
-      for (int x_dim_1 : TestSizes()) {
-        int right = x_dim_1;
-        auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
-        EXPECT_TRUE(ref != nullptr);
-        int sz = left * right;
-        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
-            outref(sz);
-        RandomVec<T>(sz, x.data());
-        RandomVec<T>(left, mean.data());
-        RandomVec<T>(left, var.data());
-        RandomVec<T>(right, scale.data());
-        RandomVec<T>(right, bias.data());
-        const T* scale_data = scale.data();
-        const T* bias_data = bias.data();
-        T* x_data = x.data();
-        T* mean_data = mean.data();
-        T* var_data = var.data();
-        T* outref_data = outref.data();
-        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
-            left, epsilon, right);
-        TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
+          // inplace
-                     std::vector<T>, std::vector<T>, std::vector<T>,
+          std::copy(param.begin(), param.end(), out.begin());
-                     std::vector<T>, std::vector<T>, int, float>(
+          tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
-            right, x, outref, mean, var, scale, bias, left, epsilon, right);
+          for (size_t i = 0; i < rows.size(); ++i) {
-      }
+            ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
+                        oref_data + rows[i] * attr.grad_width, attr.grad_width);
          }
+        };
+        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, lr, param, grad,
+                                             rows, param_out, attr);
      }
-}
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
-void TestKernelCRFDecodingTuples() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  constexpr int state_trans_base_idx = 2;
-  auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
-  for (int seq_len : {1, 11, 17, 50}) {
-    for (int tag_num : test_sizes) {
-      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
-      EXPECT_TRUE(ref != nullptr);
-      int x_sz = seq_len * tag_num;
-      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
-      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
-      std::vector<int> trackref(x_sz);
-      RandomVec<T>(x_sz, x.data());
-      RandomVec<T>(w_sz, w.data());
-      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
-          trackref.data(), tag_num);
-      TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
-                   std::vector<T>, std::vector<T>, std::vector<T>,
-                   std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
-                                          trackref, tag_num);
    }
  }
 }
-template <jit::KernelType KT, typename T, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
-void TestKernelVBroadcastTuples() {
+void TestKernelVBroadcast() {
-  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int w : TestSizes()) {
    std::vector<T> x(w);
    RandomVec<T>(w, x.data());
    const T* x_data = x.data();
    for (int64_t h : {1, 2, 6}) {
-      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
+      auto ref = jit::GetReferFunc<KernelTuple>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> y(w * h);
      T* y_data = y.data();
      ref(x_data, y_data, h, w);
-      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
+      auto verifier = [](const typename KernelTuple::func_type tgt,
-                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
+                         const std::vector<T>& x, const std::vector<T>& yref,
-                                            static_cast<int64_t>(w));
+                         const int64_t& h,
+                         const typename KernelTuple::attr_type& attr) {
+        EXPECT_TRUE(tgt != nullptr);
+        EXPECT_EQ(x.size(), static_cast<size_t>(attr));
+        EXPECT_EQ(yref.size(), x.size() * h);
+        std::vector<T> y(yref.size());
+        const T* x_data = x.data();
+        const T* yref_data = yref.data();
+        T* y_data = y.data();
+        tgt(x_data, y_data, h, attr);
+        ExpectEQ<T>(y_data, yref_data, yref.size());
+      };
+      TestAllImpls<KernelTuple, PlaceType>(static_cast<int64_t>(w), verifier, x,
+                                           y, h, static_cast<int64_t>(w));
    }
  }
 }
-#define TEST_CPU_KERNEL(test_tuple, kernel_type)                 \
+// test pool
-  TEST(JITKernel, kernel_type) {                                 \
+TEST(JITKernel_pool, jitcreator) {
-    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
+  const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators();
-    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_EQ(jitcreators.size(), 0UL);
+#else
+  EXPECT_EQ(jitcreators.size(), 25UL);
+#endif
+}
+TEST(JITKernel_pool, jitpool) {
+  // jitpool is related with attr
+  const auto& kers = jit::JitCodePool<jit::kVAdd>().Instance().AllKernels();
+  EXPECT_EQ(kers.size(), 0UL);
+  jit::GetAllCandidateKernels<jit::VAddTuple<float>, CPUPlace>(3);
+// after call GetAllCandidateKernels, it will create jitcode Automatically
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_EQ(kers.size(), 0UL);
+#else
+  EXPECT_EQ(kers.size(), 1UL);
+#endif
+}
+TEST(JITKernel_pool, more) {
+  const auto& kers = jit::KernelPool::Instance().AllKernels();
+#if defined(__APPLE__) || defined(__OSX__)
+  EXPECT_EQ(kers.size(), 10UL);
+#else
+#ifdef PADDLE_WITH_MKLML
+  EXPECT_EQ(kers.size(), 21UL);
+#else
+  EXPECT_EQ(kers.size(), 8UL);
+#endif
+#endif
+}
+TEST(JITKernel_pool, refer) {
+  const auto& kers = jit::ReferKernelPool::Instance().AllKernels();
+  EXPECT_EQ(kers.size(), 29UL);
+}
+// test helper
+TEST(JITKernel_helper, GetAllCandidateKernels) {
+  auto fp_kers =
+      jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_GE(fp_kers.size(), 1UL);  // refer
+#else
+#ifdef PADDLE_WITH_MKLML
+  EXPECT_GE(fp_kers.size(), 3UL);  // jitcode, mkl, refer
+#else
+  EXPECT_GE(fp_kers.size(), 2UL);  // jitcode, refer
+#endif
+#endif
+  auto db_kers =
+      jit::GetAllCandidateKernels<jit::VExpTuple<double>, CPUPlace>(10);
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_GE(db_kers.size(), 1UL);  // refer
+#else
+#ifdef PADDLE_WITH_MKLML
+  EXPECT_GE(db_kers.size(), 2UL);  // mkl, refer
+#else
+  EXPECT_GE(db_kers.size(), 1UL);  // refer
+#endif
+#endif
+}
+TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) {
+  auto fp_kers =
+      jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<float>, CPUPlace>(10);
+#if defined(__APPLE__) || defined(__OSX__)
+  EXPECT_GE(fp_kers.size(), 1UL);  // refer
+#else
+#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32)
+  EXPECT_GE(fp_kers.size(), 2UL);  // jitcode/mkl, refer
+#else
+  EXPECT_GE(fp_kers.size(), 3UL);  // jitcode, mkl, refer
+#endif
+#endif
+  auto db_kers =
+      jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<double>, CPUPlace>(10);
+#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML)
+  EXPECT_GE(db_kers.size(), 1UL);  // refer
+#else
+  EXPECT_GE(db_kers.size(), 2UL);  // mkl, refer
+#endif
+}
+TEST(JITKernel_helper, KernelFuncs) {
+  auto f1 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache().At(3);
+  auto f2 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[3];
+  EXPECT_TRUE(f1 != nullptr);
+  EXPECT_TRUE(f1 == f2);
+  auto f3 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[5];
+#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
+  EXPECT_TRUE(f2 == f3);
+#else
+  EXPECT_TRUE(f2 != f3);
+#endif
+}
+TEST(JITKernel_helper, GetAllCandidateFuncs) {
+  auto funcs = jit::GetAllCandidateFuncs<jit::VExpTuple<float>, CPUPlace>(10);
+  auto kers = jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
+  EXPECT_EQ(funcs.size(), kers.size());
+  std::vector<float> x(10), tgt(10);
+  RandomVec<float>(10, x.data());
+  auto best = jit::GetDefaultBestFunc<jit::VExpTuple<float>, CPUPlace>(10);
+  best(x.data(), tgt.data(), 10);
+  for (auto f : funcs) {
+    std::vector<float> y(10);
+    f(x.data(), y.data(), 10);
+    ExpectEQ<float>(y.data(), tgt.data(), 10);
  }
+}
-TEST_CPU_KERNEL(XYZNTuples, kVMul);
+TEST(JITKernel_helper, pack_weights) {
-TEST_CPU_KERNEL(XYZNTuples, kVAdd);
+  const int N = 8 * 60, K = 2;
-TEST_CPU_KERNEL(XYZNTuples, kVAddRelu);
+  float src[K][N], yref[K][N], y[K * N];
-TEST_CPU_KERNEL(XYZNTuples, kVSub);
+  float* x = &(src[0][0]);
+  float* ref = &(yref[0][0]);
+  for (int i = 0; i < N * K; ++i) {
+    *(x + i) = static_cast<float>(i);
+  }
+  int block = 0;
+  std::vector<int> groups;
+  if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
+    block = ZMM_FLOAT_BLOCK;
+    groups.push_back(30);
+  } else {
+    block = YMM_FLOAT_BLOCK;
+    groups.insert(groups.end(), {14, 14, 14, 14, 4});
+  }
-TEST_CPU_KERNEL(AXYNTuples, kVScal);
+  int offset = 0;
-TEST_CPU_KERNEL(AXYNTuples, kVAddBias);
+  int acc = 0;
+  for (int g : groups) {
+    g = g * block;
+    for (int k = 0; k < K; ++k) {
+      for (int i = 0; i < g; ++i) {
+        *(ref + offset) = src[k][i + acc];
+        offset++;
+      }
+    }
+    acc += g;
+  }
-TEST_CPU_KERNEL(XRNTuples, kHMax);
+  jit::pack_weights<float>(x, y, N, K);
-TEST_CPU_KERNEL(XRNTuples, kHSum);
+  ExpectEQ<float>(y, ref, N * K);
+}
-TEST_CPU_KERNEL(XYNTuples, kVRelu);
+TEST(JITKernel_helper, attr) {
-TEST_CPU_KERNEL(XYNTuples, kVIdentity);
+  std::ostringstream out;
-TEST_CPU_KERNEL(XYNTuples, kVSquare);
+  // KernelTypes
-TEST_CPU_KERNEL(XYNTuples, kVExp);
+  out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding)
-TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
+      << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1)
-TEST_CPU_KERNEL(XYNTuples, kVTanh);
+      << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2)
-TEST_CPU_KERNEL(XYNTuples, kVCopy);
+      << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax)
+      << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1)
+      << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul)
+      << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool)
+      << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd)
+      << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu)
+      << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy)
+      << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity)
+      << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu)
+      << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd)
+      << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare)
+      << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh);
+  EXPECT_EQ(out.str().size(), 234);
+  // SeqPoolTypes
+  out.str("");
+  out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg)
+      << jit::to_string(jit::kSqrt);
+  EXPECT_EQ(out.str().size(), 13);
+  EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu);
+  EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity);
+  EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp);
+  EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid);
+  EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh);
+  out.str("");
+  out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  EXPECT_EQ(out.str().size(), 89);
+  out.str("");
+  out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid);
+  EXPECT_EQ(out.str().size(), 52);
+  out.str("");
+  out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum);
+  EXPECT_EQ(out.str().size(), 44);
+  out.str("");
+  out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg);
+  EXPECT_EQ(out.str().size(), 93);
+  out.str("");
+  out << jit::sgd_attr_t(1, 2, 3, 4, 5);
+  EXPECT_EQ(out.str().size(), 81);
+  out.str("");
+  out << jit::matmul_attr_t(1, 2, 3);
+  EXPECT_EQ(out.str().size(), 14);
+}
-TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
+// test keys
-TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);
+TEST(JITKernel_key, int) {
+  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int>(2));
+  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int64_t>(2));
+  EXPECT_TRUE(jit::JitCodeKey<int>(2) != jit::JitCodeKey<int>(3));
+}
-TEST_CPU_KERNEL(GRUTuples, kGRUH1);
+TEST(JITKernel_key, gru) {
-TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1);
+  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
-TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2);
+  jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
+  jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity);
-TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC);
+  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::gru_attr_t>(attr5);
-TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool);
+  EXPECT_TRUE(key1 == key2);
-TEST_CPU_KERNEL(MatMulTuples, kMatMul);
+  EXPECT_TRUE(key2 != key3);
-TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax);
+  EXPECT_TRUE(key2 != key4);
-TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
+  EXPECT_TRUE(key2 != key5);
-TEST_CPU_KERNEL(SgdTuples, kSgd);
+  EXPECT_TRUE(key3 != key4);
-TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
+  EXPECT_TRUE(key3 != key5);
-TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
+  EXPECT_TRUE(key4 != key5);
-TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);
+}
 TEST(JITKernel_key, lstm) {
  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
-  jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
+  jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
  auto key1 = jit::JitCodeKey<jit::lstm_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::lstm_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::lstm_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::lstm_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::lstm_attr_t>(attr5);
+  auto key6 = jit::JitCodeKey<jit::lstm_attr_t>(attr6);
-  EXPECT_TRUE(key1 != key2);
+  EXPECT_TRUE(key1 == key2);
-  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key2 != key5);
  EXPECT_TRUE(key3 != key4);
+  EXPECT_TRUE(key3 != key5);
+  EXPECT_TRUE(key4 != key5);
+  EXPECT_TRUE(key5 == key6);
 }
-TEST(JITKernel_key, gru) {
+TEST(JITKernel_key, seq_pool) {
-  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
+  jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1);
-  jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh);
+  jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3);
-  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
+  jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3);
-  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
+  jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3);
-  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
+  auto key1 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr1);
-  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
+  auto key2 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr2);
-  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
+  auto key3 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr3);
-  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);
+  auto key4 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr4);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key3 != key4);
+}
+TEST(JITKernel_key, matmul) {
+  jit::matmul_attr_t attr1(1, 2, 3);
+  jit::matmul_attr_t attr2(1, 2, 3);
+  jit::matmul_attr_t attr3(1, 3, 3);
+  jit::matmul_attr_t attr4(2, 3, 4);
+  auto key1 = jit::JitCodeKey<jit::matmul_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::matmul_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::matmul_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::matmul_attr_t>(attr4);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key3 != key4);
+}
+TEST(JITKernel_key, emb_seq_pool) {
+  jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg);
+  jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum);
+  auto key1 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr5);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key2 != key5);
+  EXPECT_TRUE(key4 != key5);
+}
+TEST(JITKernel_key, sgd) {
+  jit::sgd_attr_t attr1(1, 2, 3, 4, 5);
+  jit::sgd_attr_t attr2(1, 2, 3, 4, 5);
+  jit::sgd_attr_t attr3(9, 8, 7, 4, 6);
+  jit::sgd_attr_t attr4(1, 2, 3, 6, 5);
+  jit::sgd_attr_t attr5(10, 9, 8, 7, 6);
+  auto key1 = jit::JitCodeKey<jit::sgd_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::sgd_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::sgd_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::sgd_attr_t>(attr4);
+  auto key5 = jit::JitCodeKey<jit::sgd_attr_t>(attr5);
-  EXPECT_TRUE(key1 != key2);
+  EXPECT_TRUE(key1 == key2);
  EXPECT_TRUE(key2 == key3);
  EXPECT_TRUE(key3 != key4);
+  EXPECT_TRUE(key3 != key5);
+  EXPECT_TRUE(key4 != key5);
 }
-// TODO(TJ): add more test about key and pool
+// test kernerls
+#define TestKernelVMul TestKernelXYZN
+#define TestKernelVAdd TestKernelXYZN
+#define TestKernelVAddRelu TestKernelXYZN
+#define TestKernelVSub TestKernelXYZN
+#define TestKernelVScal TestKernelAXYN
+#define TestKernelVAddBias TestKernelAXYN
+#define TestKernelVRelu TestKernelXYN
+#define TestKernelVIdentity TestKernelXYN
+#define TestKernelVSquare TestKernelXYN
+#define TestKernelVExp TestKernelXYN
+#define TestKernelVSigmoid TestKernelXYN
+#define TestKernelVTanh TestKernelXYN
+#define TestKernelVCopy TestKernelXYN
+#define TestKernelHMax TestKernelXRN
+#define TestKernelHSum TestKernelXRN
+#define TestKernelLSTMCtHt TestKernelLSTM
+#define TestKernelLSTMC1H1 TestKernelLSTM
+#define TestKernelGRUH1 TestKernelGRU
+#define TestKernelGRUHtPart1 TestKernelGRU
+#define TestKernelGRUHtPart2 TestKernelGRU
+#define TEST_CPU_KERNEL(kernel_type)                                      \
+  TEST(JITKernel, kernel_type) {                                          \
+    TestKernel##kernel_type<jit::kernel_type##Tuple<float>, CPUPlace>();  \
+    TestKernel##kernel_type<jit::kernel_type##Tuple<double>, CPUPlace>(); \
+  }
+TEST_CPU_KERNEL(VMul);
+TEST_CPU_KERNEL(VAdd);
+TEST_CPU_KERNEL(VAddRelu);
+TEST_CPU_KERNEL(VSub);
+TEST_CPU_KERNEL(VScal);
+TEST_CPU_KERNEL(VAddBias);
+TEST_CPU_KERNEL(VRelu);
+TEST_CPU_KERNEL(VIdentity);
+TEST_CPU_KERNEL(VSquare);
+TEST_CPU_KERNEL(VExp);
+TEST_CPU_KERNEL(VSigmoid);
+TEST_CPU_KERNEL(VTanh);
+TEST_CPU_KERNEL(VCopy);
+TEST_CPU_KERNEL(HMax);
+TEST_CPU_KERNEL(HSum);
+TEST_CPU_KERNEL(LSTMCtHt);
+TEST_CPU_KERNEL(LSTMC1H1);
+TEST_CPU_KERNEL(GRUH1);
+TEST_CPU_KERNEL(GRUHtPart1);
+TEST_CPU_KERNEL(GRUHtPart2);
+TEST_CPU_KERNEL(NCHW16CMulNC);
+TEST_CPU_KERNEL(LayerNorm);
+TEST_CPU_KERNEL(CRFDecoding);
+TEST_CPU_KERNEL(SeqPool);
+TEST_CPU_KERNEL(EmbSeqPool);
+TEST_CPU_KERNEL(MatMul);
+TEST_CPU_KERNEL(Softmax);
+TEST_CPU_KERNEL(Sgd);
+TEST_CPU_KERNEL(VBroadcast);
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -230,8 +230,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_EQ(bias->numel(), right);
    auto ker =
-        jit::Get<jit::kLayerNorm, jit::LayerNormTuples<T>, platform::CPUPlace>(
+        jit::KernelFuncs<jit::LayerNormTuple<T>, platform::CPUPlace>::Cache()
-            right);
+            .At(right);
    ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
        scale->data<T>(), bias->data<T>(), static_cast<int>(left),
        static_cast<const float>(epsilon), right);

--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -30,17 +30,16 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
    return;
  }
  if (relu) {
-    auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
+    auto compute =
-                                    platform::CPUPlace>::Cache()
+        jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
-                       .At(N);
+            N);
    for (int i = 0; i < M; i++) {
      T* dst = Y + i * N;
      compute(B, dst, dst, N);
    }
  } else {
-    auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
+    auto compute =
-                                    platform::CPUPlace>::Cache()
+        jit::KernelFuncs<jit::VAddTuple<T>, platform::CPUPlace>::Cache().At(N);
-                       .At(N);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif

--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -256,8 +256,8 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
          static_cast<int>(input.numel() / input.dims()[0]),
          jit::SeqPoolType::kSum);
      auto seqpool =
-          jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
+          jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache()
-              attr);
+              .At(attr);
      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
        attr.h = static_cast<int>(lod[i + 1] - lod[i]);
        seqpool(src, dst, &attr);

--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -82,8 +82,7 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
    const int kClassDim = 1;
    // 2D data. Batch x C
    auto compute_softmax =
-        jit::KernelFuncs<jit::kSoftmax, jit::SoftmaxTuples<float>,
+        jit::KernelFuncs<jit::SoftmaxTuple<float>, platform::CPUPlace>::Cache()
-                         platform::CPUPlace>::Cache()
            .At(in_dims[kClassDim]);
    compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
  }

--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -48,7 +48,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
        T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
        auto sgd =
-            jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
+            jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
+                attr);
        sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
      } else if (grad_var->IsType<framework::SelectedRows>()) {
        // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
@@ -82,7 +83,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
        PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
        auto sgd =
-            jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
+            jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
+                attr);
        sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
      } else {
        PADDLE_THROW("Unsupported Variable Type of Grad");