diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
index a35ee8a09ed5ddcc4ac465d200b84358fa65b2f3..e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21 100644
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() {
 }
 
 template <typename T>
-static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n,
-                    int k) {
+static void fc_relu(const T* x, const T* w, const T* b, T* y,
+                    const jit::matmul_attr_t& attr) {
   auto matmul =
-      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
   auto addbias_relu =
-      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(n);
-  matmul(x, w, y, m, n, k);
+      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
+  matmul(x, w, y, &attr);
   T* dst = y;
-  for (int i = 0; i < m; ++i) {
-    addbias_relu(b, dst, dst, n);
-    dst += n;
+  for (int i = 0; i < attr.m; ++i) {
+    addbias_relu(b, dst, dst, attr.n);
+    dst += attr.n;
   }
 }
 
@@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
 
     auto i_dims = in->dims();
     auto w_dims = weights[0]->dims();
-    int m = i_dims[0];
-    int n = w_dims[1];
-    int k = w_dims[0];
-    relus[0]->Resize({m, n});
+    jit::matmul_attr_t attr;
+    attr.m = i_dims[0];
+    attr.n = w_dims[1];
+    attr.k = w_dims[0];
+    relus[0]->Resize({attr.m, attr.n});
     fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(),
-            relus[0]->mutable_data<T>(place), m, n, k);
+            relus[0]->mutable_data<T>(place), attr);
 
     for (int i = 1; i < weight_sz - 1; ++i) {
       auto i_dims = relus[i - 1]->dims();
       auto w_dims = weights[i]->dims();
-      int m = i_dims[0];
-      int n = w_dims[1];
-      int k = w_dims[0];
-      relus[i]->Resize({m, n});
+      attr.m = i_dims[0];
+      attr.n = w_dims[1];
+      attr.k = w_dims[0];
+      relus[i]->Resize({attr.m, attr.n});
       fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(),
-              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), m, n, k);
+              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), attr);
     }
 
     auto i_dims_last = relus[weight_sz - 2]->dims();
     auto w_dims_last = weights[weight_sz - 1]->dims();
-    m = i_dims_last[0];
-    n = w_dims_last[1];
-    k = w_dims_last[0];
+    attr.m = i_dims_last[0];
+    attr.n = w_dims_last[1];
+    attr.k = w_dims_last[0];
     fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(),
-            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place), m, n,
-            k);
+            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place),
+            attr);
   }
 };
 
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
index 00dafdead53bbd4614c70875441c565724fca46d..8c8b079633aacb711aa304ec7016c37c6bec61ce 100644
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
 
     auto x_dims = x->dims();
     auto y_dims = y->dims();
-    int m = x_dims[0];
-    int k = x_dims[1];
-    int n = y_dims[1];
-    int o_numel = m * n;
+    jit::matmul_attr_t attr;
+    attr.m = x_dims[0];
+    attr.k = x_dims[1];
+    attr.n = y_dims[1];
+    int o_numel = attr.m * attr.n;
 
     auto vsquare_x =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(m * k);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
+                                                                       attr.k);
     auto vsquare_y =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(k * n);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
+                                                                       attr.n);
     auto vsquare_xy =
         jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
     auto vsub =
@@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     auto vscal =
         jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
     auto matmul =
-        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
 
     const T* x_data = x->data<T>();
     const T* y_data = y->data<T>();
@@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     T* squared_xy_data = squared_xy->mutable_data<T>(place);
     T* o_data = out->mutable_data<T>(place);
 
-    matmul(x_data, y_data, squared_xy_data, m, n, k);
+    matmul(x_data, y_data, squared_xy_data, &attr);
     vsquare_xy(squared_xy_data, squared_xy_data, o_numel);
 
-    vsquare_x(x_data, squared_x_data, m * k);
-    vsquare_y(y_data, squared_y_data, k * n);
-    matmul(squared_x_data, squared_y_data, o_data, m, n, k);
+    vsquare_x(x_data, squared_x_data, attr.m * attr.k);
+    vsquare_y(y_data, squared_y_data, attr.k * attr.n);
+    matmul(squared_x_data, squared_y_data, o_data, &attr);
 
     vsub(squared_xy_data, o_data, o_data, o_numel);
     vscal(&scalar, o_data, o_data, o_numel);
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 5c5a61f64093802697eb21452267471129c7fcf3..1b9360afcecf63ff0c3e306cdf303cc426e80f1e 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -311,8 +311,9 @@ void BenchMatMulKernel() {
         const T* a_data = a.data<T>();
         const T* b_data = b.data<T>();
         T* c_data = c.mutable_data<T>(PlaceType());
-        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,
-                                                           c_data, m, n, k);
+        const jit::matmul_attr_t attr{m, n, k};
+        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
+                                                           c_data, &attr);
       }
     }
   }
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index 2ea8f927e1a13867fa2065841fac05e766735237..efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET)
 endfunction()
 
 # use gen jitcode kernel by name
+USE_JITKERNEL_GEN(kMatMul)
 USE_JITKERNEL_GEN(kVMul)
 USE_JITKERNEL_GEN(kVAdd)
 USE_JITKERNEL_GEN(kVSub)
diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae3858eab20aeb80553d8fcec4088a6632c9c17d
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/matmul.h"
+#include <stddef.h>  // offsetof
+#include <vector>
+
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+// Emits z = x * y for one row of x, accumulating each weight block by FMA.
+void MatMulJitCode::genCode() {
+  preCode();
+  int block, rest;
+  const auto groups = packed_groups(n_, k_, &block, &rest);
+  PADDLE_ENFORCE_GT(groups.front(), 0);
+
+  const int block_len = sizeof(float) * block;
+  const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
+  const int w_reg_idx = x_reg_idx - 1;
+  // Weights are read straight from param_y in row-major (k_, n_) layout, not
+  // from attr.packed_weight, so row k starts k * n_ floats into the buffer.
+  mov(reg_ptr_wgt, param_y);
+  size_t z_offset = 0;  // byte offset of group g's first column (in z and y)
+  for (size_t g = 0; g < groups.size(); ++g) {
+    for (int k = 0; k < k_; ++k) {
+      // Row k of this group; a running offset breaks with several groups.
+      size_t wgt_offset = z_offset + sizeof(float) * k * n_;
+      vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + sizeof(float) * k]);
+      // first k: zero the accumulators
+      if (k == 0) {
+        for (int i = 0; i < groups[g]; ++i) {
+          vxorps(zmm_t(i), zmm_t(i), zmm_t(i));
+        }
+      }
+      for (int i = 0; i < groups[g]; ++i) {
+        vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]);
+        vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
+        wgt_offset += block_len;
+      }
+      // last k: store the accumulators
+      if (k == k_ - 1) {
+        for (int i = 0; i < groups[g]; ++i) {
+          // a partial trailing block is stored piecewise after the loops
+          if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) {
+            break;
+          }
+          vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i));
+        }
+      }
+    }
+    z_offset += block_len * groups[g];
+  }
+
+  if (rest != 0) {
+    // below should refine with mask
+    int reg_idx = groups.back() - 1;
+    z_offset = (n_ - rest) * sizeof(float);
+    int inner_block = 8;
+    while (rest > 0) {
+      if (rest >= 8) {
+        inner_block = 8;
+        vmovups(ptr[param_z + z_offset], ymm_t(reg_idx));
+        // shift zmm of inner_block, change reg_idx if update
+      } else if (rest >= 4) {
+        inner_block = 4;
+        vmovups(ptr[param_z + z_offset], xmm_t(reg_idx));
+      } else if (rest >= 2) {
+        inner_block = 2;
+        vmovq(ptr[param_z + z_offset], xmm_t(reg_idx));
+      } else {
+        inner_block = 1;
+        vmovss(ptr[param_z + z_offset], xmm_t(reg_idx));
+      }
+      z_offset += inner_block * sizeof(float);
+      rest -= inner_block;
+    }
+  }
+
+  postCode();
+}
+
+class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
+ public:
+  bool UseMe(const matmul_attr_t& attr) const override {
+    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
+           attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;  // no partial block
+  }
+  size_t CodeSize(const matmul_attr_t& attr) const override {
+    int block = YMM_FLOAT_BLOCK;  // used when AVX512F is not available
+    if (platform::MayIUse(platform::avx512f)) {
+      block = ZMM_FLOAT_BLOCK;
+    }
+    return 96 + 4 * attr.k * (attr.n / block + 1) * 8;  // scales with k * n
+  }
+  std::unique_ptr<GenBase> CreateJitCode(
+      const matmul_attr_t& attr) const override {
+    PADDLE_ENFORCE_GT(attr.m, 0);
+    PADDLE_ENFORCE_GT(attr.n, 0);
+    PADDLE_ENFORCE_GT(attr.k, 0);
+    return make_unique<MatMulJitCode>(attr, CodeSize(attr));
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
new file mode 100644
index 0000000000000000000000000000000000000000..626baa8f738bf0395f3c7f1700610d0a9075879b
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <stdlib.h>  // for malloc and free
+#include <string>
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+// JIT code generator for the kMatMul kernel; only m == 1 is supported.
+class MatMulJitCode : public JitCode {
+ public:
+  explicit MatMulJitCode(const matmul_attr_t& attr,
+                         size_t code_size = 256 * 1024,
+                         void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
+    PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
+    // Built before genCode() so the name is ready as soon as code exists.
+    name_ = "MatMulJitCode_M" + std::to_string(m_) + "_N" +
+            std::to_string(n_) + "_K" + std::to_string(k_);
+    this->genCode();
+  }
+
+  // The returned pointer stays valid for the lifetime of this object.
+  virtual const char* name() const { return name_.c_str(); }
+  void genCode() override;
+
+ private:
+  int m_, n_, k_;
+  std::string name_;  // "MatMulJitCode_M<m>_N<n>_K<k>"
+
+  reg64_t param_x{abi_param1};
+  reg64_t param_y{abi_param2};
+  reg64_t param_z{abi_param3};
+  reg64_t param_attr{abi_param4};
+  reg64_t reg_tmp{rax};
+  reg64_t reg_ptr_wgt{r10};
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
index 310da0c76f1ab251d788e54f2305f375f3fb4838..3cd5f6554bdc188ce9ea0c0b85c84d032c509600 100644
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -16,6 +16,8 @@
 #include <fstream>
 #include <iostream>
 #include <sstream>
+#include <vector>
+#include "paddle/fluid/platform/cpu_info.h"
 
 DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
 
@@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const {
   }
 }
 
+// Splits n into groups of vector registers.
+// n is covered by blocks of `block` elements (ZMM_FLOAT_BLOCK when AVX512F
+// may be used, YMM_FLOAT_BLOCK otherwise); a partial last block counts as a
+// whole one. Blocks are then gathered into groups of at most
+// (register count - 2) blocks each. When non-null, block_out receives the
+// block size and rest_out receives n % block. k does not affect the result.
+std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
+  const bool has_avx512 = platform::MayIUse(platform::avx512f);
+  const int block = has_avx512 ? ZMM_FLOAT_BLOCK : YMM_FLOAT_BLOCK;
+  const int max_num_regs = has_avx512 ? 32 : 16;
+  // one register for x, one for y, the others for z
+  const int regs_per_group = max_num_regs - 2;
+  const int num_block = n / block + (n % block == 0 ? 0 : 1);
+
+  std::vector<int> groups(num_block / regs_per_group, regs_per_group);
+  const int tail_regs = num_block % regs_per_group;
+  if (tail_regs != 0) {
+    groups.push_back(tail_regs);
+  }
+
+  if (block_out != nullptr) {
+    *block_out = block;
+  }
+  if (rest_out != nullptr) {
+    *rest_out = n % block;
+  }
+  return groups;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index 4af01a437670aa6a07d370ff23ed2abd369f69a3..d808a332472ae86240cb63356cb417123523366a 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -16,6 +16,7 @@
 
 #include <gflags/gflags.h>
 #include <memory>  // for unique_ptr
+#include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 DECLARE_bool(dump_jitcode);
@@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator {
   virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
 };
 
+// Shared helper that computes the packed groups used for the weights.
+// It also outputs the block size and the rest size through the pointers.
+std::vector<int> packed_groups(int n, int k, int* block = nullptr,
+                               int* rest = nullptr);
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
index 4dac2f2460f72c7da63f48c82549b948cc253153..e7292fe2bd8031aa5bbff68e7c2305a238085bf1 100644
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/operators/jit/helper.h"
 #include <algorithm>  // tolower
+#include <numeric>
+#include <string>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -91,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) {
   return kNone;
 }
 
+template <>
+void pack_weights<float>(const float* src, float* dst, int n, int k) {
+  int block, rest;
+  const auto groups = packed_groups(n, k, &block, &rest);
+  std::for_each(groups.begin(), groups.end(), [&](int i) {
+    PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
+  });
+  int sum = std::accumulate(groups.begin(), groups.end(), 0);  // total blocks
+  std::memset(dst, 0, k * sum * block * sizeof(float));  // zero-fill padding
+  PADDLE_ENFORCE_GE(sum * block, n,
+                    "The packed n should be equal to or larger than n");
+  // src is read as k rows of n floats.
+  const int block_len = sizeof(float) * block;
+  int n_offset = 0;
+  // dst gets, per group, k rows of groups[g] * block floats (zero padded).
+  for (size_t g = 0; g < groups.size(); ++g) {
+    const float* from = src + n_offset;
+    for (int j = 0; j < k; ++j) {
+      size_t copy_sz = groups[g] * block_len;
+      if (g == groups.size() - 1 && rest != 0) {  // partial last block
+        copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float);
+      }
+      std::memcpy(dst, from + j * n, copy_sz);
+      dst += groups[g] * block;
+    }
+    n_offset += groups[g] * block;
+  }
+}
+
+// Packing is only implemented for float; the double version always throws.
+template <>
+void pack_weights<double>(const double* src, double* dst, int n, int k) {
+  PADDLE_THROW("Only support pack with float type.");
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index 7bdc45779b7d39d36db0d52ca9361943cdcdef3e..bba3a13619619b6de3f797a4efc4a0d09c3b281f 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -152,17 +152,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
      << (attr.use_peephole ? "True" : "False") << "]";
   return os;
 }
+
 inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
   os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
      << "],act_cand[" << to_string(attr.act_cand) << "]";
   return os;
 }
+
 inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
   os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
      << to_string(attr.type) << "]";
   return os;
 }
 
+// Prints the matmul shape as M[..],N[..],K[..].
+inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
+  return os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
+}
+
+// expose the method to pack matmul weight
+template <typename T>
+void pack_weights(const T* src, T* dst, int n, int k);
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
index 42a58580f7b1e0832af57398ba9c29882b6cc6fb..4a8f61146a1921fa1d5f6b7e15af40cd45d31a22 100644
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -145,11 +145,19 @@ struct SeqPoolTuples {
   typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
 };
 
+typedef struct matmul_attr_s {
+  int m{0}, n{0}, k{0};  // A is (m, k), B is (k, n), C is (m, n)
+  void* packed_weight{nullptr};
+  matmul_attr_s() = default;
+  explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr)
+      : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {}
+} matmul_attr_t;
+
 template <typename T>
 struct MatMulTuples {
   typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int, int, int);
+  typedef matmul_attr_t attr_type;
+  typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
index 61de38688664f83775c0c4e5aa6f7e06c3602ddb..1e4a8884e78c5d3c1748988f05ecf461a6f0eb94 100644
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -49,6 +49,13 @@ size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
   return (key << pool_type_shift) + static_cast<int>(attr.type);
 }
 
+template <>
+size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
+  constexpr int shift = 21;  // n and k each get a 21-bit field below m
+  const size_t m_bits = static_cast<size_t>(attr.m) << (shift * 2);
+  return m_bits + (static_cast<size_t>(attr.n) << shift) + attr.k;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index 28a37198dae19a57509934ec784746bc23436e7a..c7d0215eda9d1e14fcad16da7b70f45824789266 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -25,17 +25,19 @@ namespace more {
 namespace mkl {
 
 template <>
-void MatMul<float>(const float* a, const float* b, float* c, int m, int n,
-                   int k) {
-  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
-                                 n, k, 1.f, a, k, b, n, 0.f, c, n);
+void MatMul<float>(const float* a, const float* b, float* c,
+                   const matmul_attr_t* attr) {
+  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                 attr->m, attr->n, attr->k, 1.f, a, attr->k, b,
+                                 attr->n, 0.f, c, attr->n);
 }
 
 template <>
-void MatMul<double>(const double* a, const double* b, double* c, int m, int n,
-                    int k) {
-  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
-                                 n, k, 1.0, a, k, b, n, 0.0, c, n);
+void MatMul<double>(const double* a, const double* b, double* c,
+                    const matmul_attr_t* attr) {
+  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                 attr->m, attr->n, attr->k, 1.0, a, attr->k, b,
+                                 attr->n, 0.0, c, attr->n);
 }
 
 template <>
@@ -127,11 +129,6 @@ void ASum<double>(const double* x, double* res, int n) {
 }
 
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
-template <>
-bool MatMulKernel<float>::UseMe(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
 template <>
 bool VMulKernel<float>::UseMe(const int& d) const {
   return platform::MayIUse(platform::avx512f) && d > 512;
@@ -177,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
   return true;
 }
 
+template <>
+bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
+  return platform::MayIUse(platform::avx);  // attr is not consulted
+}
+
+template <>
+bool MatMulKernel<double>::UseMe(const matmul_attr_t& attr) const {
+  return true;  // attr is not consulted
+}
+
 template <>
 bool SoftmaxKernel<float>::UseMe(const int& d) const {
   // tuned on avx2
@@ -189,7 +196,6 @@ bool SoftmaxKernel<float>::UseMe(const int& d) const {
     return true;                                         \
   }
 
-AWALYS_USE_ME_WITH_DOUBLE(MatMul);
 AWALYS_USE_ME_WITH_DOUBLE(VMul);
 AWALYS_USE_ME_WITH_DOUBLE(VAdd);
 AWALYS_USE_ME_WITH_DOUBLE(VScal);
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index 6b95b9c872dc12cccaef0b0737edd760447a47d0..8130b87326f1887f232022ab30fa7bf42b0723e7 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -26,7 +26,7 @@ namespace more {
 namespace mkl {
 
 template <typename T>
-void MatMul(const T* a, const T* b, T* c, int m, int n, int k);
+void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr);
 
 template <typename T>
 void VMul(const T* x, const T* y, T* z, int n);
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index 5a074db7e0e8ab49dc281e1809edef23e6a25c42..0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -363,17 +363,19 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
 
 // A(M,K) * B(K,N) = C(M,N)
 template <typename T>
-void MatMul(const T* A, const T* B, T* C, int M, int N, int K) {
+void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) {
+  int M = attr->m;
+  int N = attr->n;
+  int K = attr->k;
   for (int m = 0; m < M; ++m) {
     const T* pa = A + m * K;
     T* pc = C + m * N;
     for (int n = 0; n < N; ++n) {
       const T* pb = B + n;
-      T sum = static_cast<T>(0);
-      for (int k = 0; k < K; ++k) {
-        sum += (pa[k] * pb[k * N]);
+      pc[n] = static_cast<T>(0);  // start from 0 so K == 0 never reads A or B
+      for (int k = 0; k < K; ++k) {
+        pc[n] += pa[k] * pb[k * N];
       }
-      *(pc + n) = sum;
     }
   }
 }
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index cc461552898fc68661ce548a520d65215d3572b4..237e588d35cc3b33658a830db34676967818aab6 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -22,7 +22,7 @@
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 
-static double acc = 1e-5;
+DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
 
 template <typename T>
 void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
@@ -39,7 +39,7 @@ template <typename T>
 void ExpectEQ(const T* target, const T* refer, int n) {
   if (std::is_floating_point<T>::value) {
     for (int i = 0; i < n; ++i) {
-      EXPECT_NEAR(target[i], refer[i], acc);
+      EXPECT_NEAR(target[i], refer[i], FLAGS_acc);
     }
   } else {
     for (int i = 0; i < n; ++i) {
@@ -272,21 +272,23 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
 
 template <typename T>
 struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>, int, int, int> {
+                         std::vector<T>,
+                         typename jit::MatMulTuples<T>::attr_type> {
   void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
                   const std::vector<T>& a, const std::vector<T>& b,
-                  const std::vector<T>& cref, int m, int n, int k) {
+                  const std::vector<T>& cref,
+                  const typename jit::MatMulTuples<T>::attr_type& attr) {
     EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(a.size(), static_cast<size_t>(m * k));
-    EXPECT_EQ(b.size(), static_cast<size_t>(k * n));
-    EXPECT_EQ(cref.size(), static_cast<size_t>(m * n));
+    EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
+    EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
+    EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
     std::vector<T> c(cref.size());
     const T* a_data = a.data();
     const T* b_data = b.data();
     const T* cref_data = cref.data();
     T* c_data = c.data();
-    tgt(a_data, b_data, c_data, m, n, k);
-    ExpectEQ<T>(c_data, cref_data, m * n);
+    tgt(a_data, b_data, c_data, &attr);
+    ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
   }
 };
 
@@ -383,8 +385,8 @@ void TestAXYNKernel() {
 template <jit::KernelType KT, typename T, typename PlaceType>
 void TestXRNKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = acc;
-  acc = 1e-4;
+  auto last_acc = FLAGS_acc;
+  FLAGS_acc = 1e-4;
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
     EXPECT_TRUE(ref != nullptr);
@@ -395,7 +397,7 @@ void TestXRNKernel() {
     TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
                                                                       ref_res);
   }
-  acc = last_acc;
+  FLAGS_acc = last_acc;
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
@@ -535,9 +537,10 @@ void TestSeqPoolKernel() {
 template <jit::KernelType KT, typename T, typename PlaceType>
 void TestMatMulKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = acc;
-  // TODO(intel): this should be acc issue of MKL
-  acc = 1e-3;
+  auto last_acc = FLAGS_acc;
+  // TODO(intel): fix MKL acc issue
+  // https://github.com/PaddlePaddle/Paddle/issues/15447
+  FLAGS_acc = 1e-3;
   for (int m : {1, 2, 3, 4}) {
     for (int n : {1, 2, 3, 4}) {
       for (int k : TestSizes()) {
@@ -549,13 +552,14 @@ void TestMatMulKernel() {
         const T* a_data = a.data();
         const T* b_data = b.data();
         T* c_data = c.data();
-        ref(a_data, b_data, c_data, m, n, k);
+        const jit::matmul_attr_t attr{m, n, k};
+        ref(a_data, b_data, c_data, &attr);
         TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>, std::vector<T>>(k, a, b, c, m, n, k);
+                     std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
       }
     }
   }
-  acc = last_acc;
+  FLAGS_acc = last_acc;
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>