diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index b39ce280939515ec8f4fa3b443ff4332074825fd..74d6a87247821eb1d17cc97b8d8b4bcf1c832f79 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -22,10 +22,54 @@
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/port.h"
+#include "paddle/fluid/platform/variant.h"  // for UNUSED
 
 DEFINE_int32(burning, 10, "Burning times.");
 DEFINE_int32(repeat, 3000, "Repeat times.");
 DEFINE_int32(max_size, 1000, "The Max size would be tested.");
+DEFINE_string(filter, "", "The Benchmark name would be run.");
+
+class BenchJITKernel {  // Abstract interface for one registered benchmark.
+ public:
+  BenchJITKernel() = default;
+  virtual ~BenchJITKernel() = default;
+  virtual void Run() = 0;  // Benchmark body; invoked by RUN_ALL_BENCHMARK.
+  virtual const char* Name() = 0;  // Compared against --filter and logged.
+  virtual const char* Dtype() = 0;  // Data-type label printed in the log line.
+  virtual const char* Place() = 0;  // Place label printed in the log line.
+};
+
+static std::vector<BenchJITKernel*> g_all_benchmarks;
+
+BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {  // Registers b globally.
+  g_all_benchmarks.push_back(b);  // Kept as a raw pointer in the registry.
+  return b;  // Returned so the call can initialize a static variable.
+}
+// Defines a BenchJITKernel subclass, registers an instance, and opens Run().
+#define BENCH_JITKERNEL(name, dtype, place)                                    \
+  class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \
+   public:                                                                     \
+    const char* Name() override { return #name; }                              \
+    const char* Dtype() override { return #dtype; }                            \
+    const char* Place() override { return #place; }                            \
+    void Run() override;                                                       \
+  };                                                                           \
+  static auto inserted_##name##_##dtype##_##place##_ UNUSED =                  \
+      InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_());     \
+  void BenchJITKernel_##name##_##dtype##_##place##_::Run()
+// Shorthand for BENCH_JITKERNEL with the FP32 and CPU labels.
+#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
+
+void RUN_ALL_BENCHMARK() {  // Runs registered benchmarks, honoring --filter.
+  for (auto p : g_all_benchmarks) {
+    if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
+      continue;  // A non-empty filter selects by exact Name() match only.
+    }
+    LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "."
+              << p->Place();
+    p->Run();
+  }
+}
 
 template <typename T>
 void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
@@ -228,49 +272,70 @@ void BenchMatMulKernel() {
   }
 }
 
+using T = float;  // Element type used by every benchmark registered below.
+using PlaceType = paddle::platform::CPUPlace;  // Place type for these runs.
+
+// xyzn kernels (BenchXYZNKernel)
+BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, PlaceType>(); }
+
+// axyn kernels (BenchAXYNKernel)
+BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, PlaceType>(); }
+
+// xyn kernels (BenchXYNKernel)
+BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, PlaceType>(); }
+
+// lstm and peephole
+BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>(); }
+
+// gru functions
+BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kGRUHtPart1) {
+  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
+}
+
+BENCH_FP32_CPU(kGRUHtPart2) {
+  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
+}
+
+// seq pool function
+BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>(); }
+
+// matmul
+BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, PlaceType>(); }
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
 //     --burning: the burning time before count
 //     --repeat: the repeat times
 //     --max_size: the max size would be tested
+//     --filter: run only the benchmark with this name (empty runs all)
 int main(int argc, char* argv[]) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   google::InitGoogleLogging(argv[0]);
   LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
             << " times.";
-  using T = float;
-  using PlaceType = paddle::platform::CPUPlace;
-  // xyzn
-  BenchXYZNKernel<jit::kVMul, T, PlaceType>();
-  BenchXYZNKernel<jit::kVAdd, T, PlaceType>();
-  BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>();
-  BenchXYZNKernel<jit::kVSub, T, PlaceType>();
-
-  // axyn
-  BenchAXYNKernel<jit::kVScal, T, PlaceType>();
-  BenchAXYNKernel<jit::kVAddBias, T, PlaceType>();
-
-  // xyn
-  BenchXYNKernel<jit::kVRelu, T, PlaceType>();
-  BenchXYNKernel<jit::kVIdentity, T, PlaceType>();
-  BenchXYNKernel<jit::kVSquare, T, PlaceType>();
-  BenchXYNKernel<jit::kVExp, T, PlaceType>();
-  BenchXYNKernel<jit::kVSigmoid, T, PlaceType>();
-  BenchXYNKernel<jit::kVTanh, T, PlaceType>();
-
-  // lstm and peephole
-  BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>();
-  BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>();
-
-  // gru functions
-  BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
-  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
-  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
-
-  // seq pool function
-  BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
 
-  // matmul
-  BenchMatMulKernel<jit::kMatMul, T, PlaceType>();
+  RUN_ALL_BENCHMARK();
 }
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index f4415a54ca9678c75038a820bb5d212e61593ec7..68a79b6314e4cf86f5b715b9c6694924126b12da 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -22,6 +22,8 @@
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 
+static double acc = 1e-5;  // Tolerance passed to EXPECT_NEAR in ExpectEQ.
+
 template <typename T>
 void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
                const T upper = static_cast<T>(20.f)) {
@@ -37,7 +39,7 @@ template <typename T>
 void ExpectEQ(const T* target, const T* refer, int n) {
   if (std::is_floating_point<T>::value) {
     for (int i = 0; i < n; ++i) {
-      EXPECT_NEAR(target[i], refer[i], 1e-5);
+      EXPECT_NEAR(target[i], refer[i], acc);
     }
   } else {
     for (int i = 0; i < n; ++i) {
@@ -62,7 +64,9 @@ namespace jit = paddle::operators::jit;
 
 template <typename KernelTuples, typename... Args>
 struct TestFuncWithRefer {
-  void operator()(const typename KernelTuples::func_type tgt, Args... args) {}
+  void operator()(const typename KernelTuples::func_type tgt, Args... args) {
+    LOG(FATAL) << "Should specify this function.";  // Unspecialized fallback.
+  }
 };
 
 template <typename T>
@@ -140,7 +144,8 @@ struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
 
 template <typename T>
 struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>, std::vector<T>, std::vector<T>> {
+                         std::vector<T>, std::vector<T>, std::vector<T>,
+                         typename jit::LSTMTuples<T>::attr_type> {
   void operator()(const typename jit::LSTMTuples<T>::func_type tgt,
                   const std::vector<T>& xsrc, const std::vector<T>& wp,
                   const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
@@ -185,7 +190,8 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
 
 template <typename T>
 struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>> {
+                         std::vector<T>,
+                         typename jit::GRUTuples<T>::attr_type> {
   void operator()(const typename jit::GRUTuples<T>::func_type tgt,
                   const std::vector<T>& xsrc, const std::vector<T>& ht_1,
                   const std::vector<T>& ht_ref,
@@ -212,8 +218,8 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
 };
 
 template <typename T>
-struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>,
-                         std::vector<T>> {
+struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
+                         typename jit::SeqPoolTuples<T>::attr_type> {
   void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
                   const std::vector<T>& x, const std::vector<T>& yref,
                   const typename jit::SeqPoolTuples<T>::attr_type& attr) {
@@ -385,8 +391,8 @@ void TestLSTMKernel() {
             std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
             std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
             RandomVec<T>(4 * d, xsrc.data(), -2.f, 2.f);
-            RandomVec<T>(3 * d, wp.data(), -2.f, 2.f);
-            RandomVec<T>(d, ct_1.data(), -2.f, 2.f);
+            RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
+            RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
             // x could be changed after compute, so copy to save src
             std::vector<T> x(xsrc.size());
             std::copy(xsrc.begin(), xsrc.end(), x.begin());
@@ -481,14 +487,17 @@ void TestSeqPoolKernel() {
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void TestMatMulKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  auto last_acc = acc;
+  // TODO(intel): loosened tolerance; this should be an MKL accuracy issue.
+  acc = 1e-3;
   for (int m : {1, 2, 3, 4}) {
     for (int n : {1, 2, 3, 4}) {
       for (int k : TestSizes()) {
         auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
         EXPECT_TRUE(ref != nullptr);
         std::vector<T> a(m * k), b(k * n), c(m * n);
-        RandomVec<T>(m * k, a.data(), -0.2f, 0.2f);
-        RandomVec<T>(k * n, b.data(), -0.2f, 0.2f);
+        RandomVec<T>(m * k, a.data(), -2.f, 2.f);
+        RandomVec<T>(k * n, b.data(), -2.f, 2.f);
         const T* a_data = a.data();
         const T* b_data = b.data();
         T* c_data = c.data();
@@ -498,6 +507,7 @@ void TestMatMulKernel() {
       }
     }
   }
+  acc = last_acc;
 }
 
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>