refine and init jitkernel vmul

a53b1b0b · tensor-tang · 2139b9f6 · a53b1b0b · a53b1b0b · a53b1b0b
8 changed file
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -77,5 +77,5 @@ cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 cc_library(jit_kernel 
    SRCS jit_kernel.cc jit_gen.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
-    DEPS cpu_info cblas gflags)
+    DEPS cpu_info cblas gflags enforce)
 cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -39,8 +39,8 @@ class Kernel {
 public:
  Kernel() = default;
  virtual ~Kernel() = default;
+  // TODO(TJ): below members should be deprecated.
  int num_{0};
-  // TODO(TJ): below two should be reomved.
  int end_{0};
  int rest_{0};
  DISABLE_COPY_AND_ASSIGN(Kernel);
@@ -65,7 +65,7 @@ class KernelPool {
 template <typename T>
 class VMulKernel : public Kernel {
 public:
-  virtual void Compute(const T *x, const T *y, T *z) const = 0;
+  void (*Compute)(const T *, const T *, T *, int);
 };

 template <typename T>

--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -14,7 +14,10 @@ limitations under the License. */

 #include "paddle/fluid/operators/math/jit_kernel.h"
 #include <string>
+#include "paddle/fluid/operators/math/jit_gen.h"
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/platform/enforce.h"
+
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
@@ -28,64 +31,97 @@ namespace operators {
 namespace math {
 namespace jitkernel {

-namespace jit = platform::jit;
+namespace jit = platform::jit;  // remove me
+
+using namespace platform::jit;  // NOLINT

 /* VMUL JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
-class VMulKernelImpl : public VMulKernel<T> {
- public:
-  explicit VMulKernelImpl(int d) : VMulKernel<T>() { this->num_ = d; }
-  void Compute(const T* x, const T* y, T* z) const override {
-    for (int i = 0; i < this->num_; ++i) {
-      z[i] = x[i] * y[i];
-    }
+struct VMulJitCode : public gen::JitCode {
+  DECLARE_JIT_CODE(VMulJitCode);
+  explicit VMulJitCode(size_t code_size = 256 * 1024, void* code_ptr = nullptr)
+      : gen::JitCode(code_size, code_ptr) {}
+  static bool init(int d) {
+    if (MayIUse(avx) || MayIUse(avx2)) {
+      return d % AVX_FLOAT_BLOCK == 0;
+    } else if (MayIUse(avx512f)) {
+      return d % AVX512_FLOAT_BLOCK == 0;
+    } else {
+      return false;
+    }
+  }
+  void generate() override {
+    preCode();
+    postCode();
  }
 };

-#ifdef PADDLE_WITH_MKLML
-#define MKL_FLOAT(isa, block)                           \
-  template <>                                           \
-  void VMulKernelImpl<float, isa, block>::Compute(      \
-      const float* x, const float* y, float* z) const { \
-    platform::dynload::vsMul(this->num_, x, y, z);      \
-  }
-
-#define MKL_DOUBLE(isa, block)                             \
-  template <>                                              \
-  void VMulKernelImpl<double, isa, block>::Compute(        \
-      const double* x, const double* y, double* z) const { \
-    platform::dynload::vdMul(this->num_, x, y, z);         \
+template <typename T>
+void VMulRefer(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
  }
+}

-FOR_EACH_ISA(MKL_FLOAT, kGT16);
-FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
+#ifdef PADDLE_WITH_MKLML
+template <typename T>
+void VMulMKL(const T* x, const T* y, T* z, int n);
+
+template <>
+void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
+  platform::dynload::vsMul(n, x, y, z);
+}
+template <>
+void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
+  platform::dynload::vdMul(n, x, y, z);
+}
 #endif

-#define INTRI8_FLOAT(isa)                               \
-  template <>                                           \
-  void VMulKernelImpl<float, isa, kEQ8>::Compute(       \
-      const float* x, const float* y, float* z) const { \
-    __m256 tmpx, tmpy;                                  \
-    tmpx = _mm256_loadu_ps(x);                          \
-    tmpy = _mm256_loadu_ps(y);                          \
-    tmpx = _mm256_mul_ps(tmpx, tmpy);                   \
-    _mm256_storeu_ps(z, tmpx);                          \
+template <typename T>
+class VMulKernelImpl : public VMulKernel<T> {
+ public:
+  static inline std::string name(int d) {
+    PADDLE_THROW("DType should be either float or double");
  }
+  static inline bool useJIT(int d) { return false; }
+  static inline bool useMKL(int d) { return false; }

-// avx > for > mkl
-#ifdef __AVX__
-INTRI8_FLOAT(jit::avx);
-#endif
-#ifdef __AVX2__
-INTRI8_FLOAT(jit::avx2);
-#endif
-#ifdef __AVX512F__
-INTRI8_FLOAT(jit::avx512f);
+  explicit VMulKernelImpl(int d) : VMulKernel<T>() {
+    if (useJIT(d)) {
+      constexpr size_t sz = 256 * 1024;  // TODO(TJ): should be related with d
+      jitcode_.reset(new VMulJitCode(sz));
+      this->Compute =
+          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
+      return;
+    }
+#ifdef PADDLE_WITH_MKLML
+    if (useMKL(d)) {
+      this->Compute = VMulMKL<T>;
+      return;
+    }
 #endif
-// TODO(TJ): eq16 test and complete avx512
-#undef INTRI8_FLOAT
-#undef MKL_FLOAT
-#undef MKL_DOUBLE
+    this->Compute = VMulRefer<T>;
+  }
+
+ private:
+  std::unique_ptr<VMulJitCode> jitcode_{nullptr};
+};
+
+template <>
+bool VMulKernelImpl<float>::useJIT(int d) {
+  return VMulJitCode::init(d);
+}
+
+template <>
+bool VMulKernelImpl<float>::useMKL(int d) {
+  return jit::MayIUse(jit::avx512f) && d > 512;
+}
+
+template <>
+bool VMulKernelImpl<double>::useMKL(int d) {
+  return true;
+}
+
+REGISTER_JITKERNEL(vmul, VMulKernel);

 /* VADD JitKernel */
 template <typename T, platform::jit::cpu_isa_t isa, jit_block>
@@ -465,13 +501,12 @@ INTRI_COMMON_FLOAT(jit::avx512f, kGT16);
 #undef INTRI16_FLOAT
 #undef INTRI_COMMON_FLOAT

-REGISTER_JITKERNEL(vmul, VMulKernel);
-REGISTER_JITKERNEL(vadd, VAddKernel);
-REGISTER_JITKERNEL(vscal, VScalKernel);
-REGISTER_JITKERNEL(vaddb, VAddBiasKernel);
-REGISTER_JITKERNEL(vrelu, VReluKernel);
-REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
-REGISTER_JITKERNEL(videntity, VIdentityKernel);
+REGISTER_JITKERNEL_DEPRECATED(vadd, VAddKernel);
+REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel);
+REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel);
+REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel);
+REGISTER_JITKERNEL_DEPRECATED(vaddrelu, VAddReluKernel);
+REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel);

 }  // namespace jitkernel
 }  // namespace math

--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -288,7 +288,7 @@ INTRIAVX512_FLOAT(kGT16);
 #undef INIT_ALPHA
 #undef UPDATE_ALPHA

-REGISTER_JITKERNEL(crf_decode, CRFDecodeKernel);
+REGISTER_JITKERNEL_DEPRECATED(crf_decode, CRFDecodeKernel);

 }  // namespace jitkernel
 }  // namespace math

--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -250,7 +250,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
 #undef MKL_FLOAT
 #undef MKL_DOUBLE

-REGISTER_JITKERNEL(vexp, VExpKernel);
+REGISTER_JITKERNEL_DEPRECATED(vexp, VExpKernel);

 /* VSigmoid JitKernel */
 template <typename T, jit::cpu_isa_t isa, jit_block>
@@ -396,7 +396,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
 #undef INTRI_GT16_FLOAT
 #undef INTRI_VSIGMOID

-REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
+REGISTER_JITKERNEL_DEPRECATED(vsigmoid, VSigmoidKernel);

 /* VTanh JitKernel */
 template <typename T, jit::cpu_isa_t isa, jit_block>
@@ -531,7 +531,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
 #undef INTRI_GT16_FLOAT
 #undef INTRI_VTANH

-REGISTER_JITKERNEL(vtanh, VTanhKernel);
+REGISTER_JITKERNEL_DEPRECATED(vtanh, VTanhKernel);

 #undef JITKERNEL_NEW_ACT_IMPL


--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
@@ -21,8 +21,71 @@ namespace operators {
 namespace math {
 namespace jitkernel {

-namespace jit = platform::jit;
+#define JITKERNEL_DEFINE_NAME(ker_key, ker_class)    \
+  template <>                                        \
+  std::string ker_class##Impl<float>::name(int d) {  \
+    std::string key(#ker_key "f");                   \
+    if (useJIT(d)) {                                 \
+      /* only jit code need record d*/               \
+      return key + "jit" + std::to_string(d);        \
+    } else if (useMKL(d)) {                          \
+      return key + "mkl";                            \
+    } else {                                         \
+      return key + "any";                            \
+    }                                                \
+  }                                                  \
+  template <>                                        \
+  std::string ker_class##Impl<double>::name(int d) { \
+    std::string key(#ker_key "d");                   \
+    /* jit code do not support double yet*/          \
+    if (useMKL(d)) {                                 \
+      return key + "mkl";                            \
+    } else {                                         \
+      return key + "any";                            \
+    }                                                \
+  }
+
+#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
+  template <>                                   \
+  std::shared_ptr<const ker_class<ker_dtype>>   \
+  KernelPool::Get<ker_class<ker_dtype>, int>(int d)
+
+#define JITKERNEL_FIND_KEY(ker_class, ker_dtype) \
+  std::string key = ker_class##Impl<ker_dtype>::name(d)

+#define JITKERNEL_IMPL(ker_class, ker_dtype)           \
+  p = std::dynamic_pointer_cast<ker_class<ker_dtype>>( \
+      std::make_shared<ker_class##Impl<ker_dtype>>(d))
+
+#define REGISTER_JITKERNEL_WITH_DTYPE(ker_class, ker_dtype, marco_declare, \
+                                      macro_find_key, macro_impl)          \
+  marco_declare(ker_class, ker_dtype) {                                    \
+    macro_find_key(ker_class, ker_dtype);                                  \
+    if (kers_.find(key) == kers_.end()) {                                  \
+      std::shared_ptr<ker_class<ker_dtype>> p;                             \
+      macro_impl(ker_class, ker_dtype);                                    \
+      kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});           \
+      return p;                                                            \
+    }                                                                      \
+    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>(          \
+        kers_.at(key));                                                    \
+  }
+
+#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name,     \
+                                marco_declare, macro_find_key, macro_impl) \
+  marco_define_name(ker_key, ker_class);                                   \
+  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE,       \
+                                JITKERNEL_FIND_KEY, JITKERNEL_IMPL);       \
+  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE,      \
+                                JITKERNEL_FIND_KEY, JITKERNEL_IMPL)
+
+#define REGISTER_JITKERNEL(ker_key, ker_class)                       \
+  REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \
+                          JITKERNEL_DECLARE, JITKERNEL_FIND_KEY,     \
+                          JITKERNEL_IMPL)
+
+namespace jit = platform::jit;
+// TODO(TJ): below defines are deprecated, would be remove recently
 #define SEARCH_BLOCK(macro_, ker, dtype, isa)                 \
  if (d < AVX_FLOAT_BLOCK) {                                  \
    macro_(ker, dtype, isa, kLT8);                            \
@@ -47,20 +110,16 @@ namespace jit = platform::jit;
    SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
  }

-#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
-  template <>                                   \
-  std::shared_ptr<const ker_class<ker_dtype>>   \
-  KernelPool::Get<ker_class<ker_dtype>, int>(int d)
-
 #define JITKERNEL_KEY(ker_key, dtype_key) \
  #ker_key #dtype_key + std::to_string(d)

-#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
+#define JITKERNEL_NEW_IMPL_DEPRECATED(ker, dtype, isa, k) \
  p = std::dynamic_pointer_cast<ker<dtype>>(              \
      std::make_shared<ker##Impl<dtype, isa, k>>(d))

-#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
-                             marco_declare, macro_key, macro_impl)     \
+#define JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, ker_dtype,       \
+                                        dtype_key, marco_declare, macro_key, \
+                                        macro_impl)                          \
  marco_declare(ker_class, ker_dtype) {                                      \
    std::string key = macro_key(ker_key, dtype_key);                         \
    if (kers_.find(key) == kers_.end()) {                                    \
@@ -73,18 +132,20 @@ namespace jit = platform::jit;
        kers_.at(key));                                                      \
  }

-#define REGISTER_JITKERNEL(ker_key, ker_class)                           \
-  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE,  \
-                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL);               \
-  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \
-                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
-
-#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key,  \
-                                macro_impl)                                    \
-  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
-                       macro_impl);                                            \
-  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare,           \
-                       macro_key, macro_impl)
+#define REGISTER_JITKERNEL_DEPRECATED(ker_key, ker_class)           \
+  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f,     \
+                                  JITKERNEL_DECLARE, JITKERNEL_KEY, \
+                                  JITKERNEL_NEW_IMPL_DEPRECATED);   \
+  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d,    \
+                                  JITKERNEL_DECLARE, JITKERNEL_KEY, \
+                                  JITKERNEL_NEW_IMPL_DEPRECATED)
+
+#define REGISTER_JITKERNEL_ARGS_DEPRECATED(ker_key, ker_class, marco_declare,  \
+                                           macro_key, macro_impl)              \
+  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, marco_declare, \
+                                  macro_key, macro_impl);                      \
+  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d,               \
+                                  marco_declare, macro_key, macro_impl)

 #define FOR_EACH_ISA(macro_, block) \
  macro_(jit::avx512f, block);      \

--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -179,23 +179,23 @@ class LSTMKernelImpl : public LSTMKernel<T> {

    /* C_t = C_t-1 * fgated + cand_gated * igated */
    act_cand_d_->Compute(gates, gates);
-    vmul_d_->Compute(gates, gates + d_, gates + d_);
-    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_);
+    vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
+    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
    vadd_d_->Compute(gates + d_, gates + d2_, ct);

    /* H_t = act_cell(C_t) * ogated */
    act_cell_d_->Compute(ct, gates + d2_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
  }
  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
    /* C_t = igated * cgated*/
    act_gate_d_->Compute(gates + d_, gates + d_);
    act_cand_d_->Compute(gates, gates);
-    vmul_d_->Compute(gates, gates + d_, ct);
+    vmul_d_->Compute(gates, gates + d_, ct, d_);
    /* H_t = act_cell(C_t) * ogated */
    act_gate_d_->Compute(gates + d3_, gates + d3_);
    act_cell_d_->Compute(ct, gates + d2_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
  }

 private:
@@ -289,36 +289,36 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
                   T* checked) const override {
    /* get fgated and igated*/
-    vmul_d_->Compute(wp_data, ct_1, checked);
-    vmul_d_->Compute(wp_data + d_, ct_1, checked + d_);
+    vmul_d_->Compute(wp_data, ct_1, checked, d_);
+    vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
    vadd_d2_->Compute(checked, gates + d_, gates + d_);
    act_gate_d2_->Compute(gates + d_, gates + d_);
    /* C_t = C_t-1 * fgated + cand_gated * igated*/
    act_cand_d_->Compute(gates, gates);
-    vmul_d_->Compute(gates, gates + d_, gates + d_);
-    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_);
+    vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
+    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
    vadd_d_->Compute(gates + d_, gates + d2_, ct);
    /* get ogated*/
-    vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
+    vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
    act_gate_d_->Compute(gates + d3_, gates + d3_);
    /* H_t = act_cell(C_t) * ogated */
    act_cell_d_->Compute(ct, gates + d2_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
  }

  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
    /* C_t = igated * cgated*/
    act_gate_d_->Compute(gates + d_, gates + d_);
    act_cand_d_->Compute(gates, gates);
-    vmul_d_->Compute(gates, gates + d_, ct);
+    vmul_d_->Compute(gates, gates + d_, ct, d_);
    /* get outgated, put W_oc * C_t on igated */
-    vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
+    vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
    /* H_t = act_cell(C_t) * ogated */
    act_gate_d_->Compute(gates + d3_, gates + d3_);
    act_cell_d_->Compute(ct, gates + d2_);
-    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
  }

 private:
@@ -352,7 +352,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
                                                   act_cell, d));      \
  }

-REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
+REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
                                   JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL);

 #undef INTRI8_FLOAT
@@ -378,13 +378,13 @@ class GRUKernelImpl : public GRUKernel<T> {
  void ComputeH1(T* gates, T* ht) const override {
    act_gate_d_->Compute(gates, gates);
    act_state_d_->Compute(gates + d2_, gates + d2_);
-    vmul_d_->Compute(gates, gates + d2_, ht);
+    vmul_d_->Compute(gates, gates + d2_, ht, d_);
  }

  void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override {
    // W: {W_update, W_reset; W_state}
    act_gate_d2_->Compute(gates, gates);
-    vmul_d_->Compute(ht_1, gates + d_, ht);
+    vmul_d_->Compute(ht_1, gates + d_, ht, d_);
  }

  void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override {
@@ -472,7 +472,7 @@ INTRI8_FLOAT(jit::avx512f);
  p = std::dynamic_pointer_cast<ker<dtype>>(       \
      std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_state, d));

-REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DECLARE_GRU,
+REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU,
                                   JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL);

 #undef INTRI8_FLOAT

--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -369,12 +369,12 @@ void lstm_ctht_better(
  int d2 = d * 2;
  vsigmoid_3d->Compute(gates + d, gates + d);
  vtanh_d->Compute(gates, gates);
-  vmul_d->Compute(gates, gates + d, gates + d);
-  vmul_d->Compute(ct_1, gates + d2, gates + d2);
+  vmul_d->Compute(gates, gates + d, gates + d, d);
+  vmul_d->Compute(ct_1, gates + d2, gates + d2, d);
  vadd_d->Compute(gates + d, gates + d2, ct);
  /* H_t = act_cell(C_t) * ogated */
  vtanh_d->Compute(ct, gates + d2);
-  vmul_d->Compute(gates + d2, gates + d * 3, ht);
+  vmul_d->Compute(gates + d2, gates + d * 3, ht, d);
 }

 TEST(JitKernel, lstm) {
@@ -578,7 +578,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) {

 TEST(JitKernel, vmul) {
  namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
+  for (int d : {7, 8, 15, 16, 30, 256, 512, 1000, 1024}) {
    std::vector<float> x(d), y(d);
    std::vector<float> zref(d), ztgt(d);
    RandomVec<float>(d, x.data());
@@ -616,7 +616,7 @@ TEST(JitKernel, vmul) {

    auto ttgts = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, y_data, ztgt_data);
+      ker->Compute(x_data, y_data, ztgt_data, d);
    }
    auto ttgte = GetCurrentUS();

@@ -800,8 +800,8 @@ TEST(JitKernel, pool) {
  EXPECT_TRUE(std::dynamic_pointer_cast<const jit::Kernel>(pvmul_f) !=
              std::dynamic_pointer_cast<const jit::Kernel>(pvmul_d));

-  const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4");
+  const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulfany");
  EXPECT_EQ(pvmul_f, pvmul_from_key);
-  const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5");
+  const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulfjit");
  EXPECT_TRUE(pvmul_from_key2 == nullptr);
 }