test.cc

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>
#include <random>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"

DEFINE_double(acc, 1e-5, "Test accuracy threshold.");

template <typename T>
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-2.f),
               const T upper = static_cast<T>(2.f)) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
  for (int i = 0; i < n; ++i) {
    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
  }
}

template <typename T>
void ExpectEQ(const T* target, const T* refer, size_t n) {
  if (std::is_floating_point<T>::value) {
    for (size_t i = 0; i < n; ++i) {
      EXPECT_NEAR(target[i], refer[i], FLAGS_acc) << " at index : " << i;
    }
  } else {
    for (size_t i = 0; i < n; ++i) {
      EXPECT_EQ(target[i], refer[i]) << " at index : " << i;
    }
  }
}

std::vector<int> TestSizes() {
  std::vector<int> s;
  for (int i = 1; i < 32; ++i) {
    s.push_back(i);
  }
  // test some large size
  s.push_back(100);
  s.push_back(1000);
  s.push_back(2000);
  return s;
}

namespace jit = paddle::operators::jit;
using CPUPlace = paddle::platform::CPUPlace;

template <typename KernelTuples, typename... Args>
struct TestFuncWithRefer {
  void operator()(const typename KernelTuples::func_type tgt, Args... args) {
    LOG(FATAL) << "Should specify this function.";
  }
};

template <typename T>
struct TestFuncWithRefer<jit::XYZNTuples<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>> {
  void operator()(const typename jit::XYZNTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& y,
                  const std::vector<T>& zref) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(zref.size(), x.size());
    EXPECT_EQ(zref.size(), y.size());
    const T* x_data = x.data();
    const T* y_data = y.data();
    const T* zref_data = zref.data();
    const int d = zref.size();

    std::vector<T> ztgt(d);
    T* ztgt_data = ztgt.data();
    // test normal
    tgt(x_data, y_data, ztgt_data, d);
    ExpectEQ<T>(ztgt_data, zref_data, d);
    // test inplace x
    std::copy(x.begin(), x.end(), ztgt.begin());
    tgt(ztgt_data, y_data, ztgt_data, d);
    ExpectEQ<T>(ztgt_data, zref_data, d);
    // test inplace y
    std::copy(y.begin(), y.end(), ztgt.begin());
    tgt(x_data, ztgt_data, ztgt_data, d);
    ExpectEQ<T>(ztgt_data, zref_data, d);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
                         std::vector<T>> {
  void operator()(const typename jit::AXYNTuples<T>::func_type tgt, const T a,
                  const std::vector<T>& x, const std::vector<T>& yref) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(yref.size(), x.size());
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    const int d = yref.size();
    std::vector<T> ytgt(d);
    T* ytgt_data = ytgt.data();
    // test normal
    tgt(&a, x_data, ytgt_data, d);
    ExpectEQ<T>(ytgt_data, yref_data, d);
    // test inplace x
    std::copy(x.begin(), x.end(), ytgt.begin());
    tgt(&a, ytgt_data, ytgt_data, d);
    ExpectEQ<T>(ytgt_data, yref_data, d);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::SoftmaxTuples<T>, std::vector<T>, std::vector<T>,
                         int, int> {
  void operator()(const typename jit::SoftmaxTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref, int n,
                  int bs) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(yref.size(), x.size());
    EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    std::vector<T> ytgt(n * bs);
    T* ytgt_data = ytgt.data();
    // test normal
    tgt(x_data, ytgt_data, n, bs);
    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
    // test inplace x
    std::copy(x.begin(), x.end(), ytgt.begin());
    tgt(ytgt_data, ytgt_data, n, bs);
    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
  void operator()(const typename jit::XRNTuples<T>::func_type tgt,
                  const std::vector<T>& x, const T ref_res) {
    EXPECT_TRUE(tgt != nullptr);
    T tgt_res;
    tgt(x.data(), &tgt_res, x.size());
    ExpectEQ<T>(&tgt_res, &ref_res, 1);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::VBroadcastTuples<T>, std::vector<T>,
                         std::vector<T>, int64_t,
                         typename jit::VBroadcastTuples<T>::attr_type> {
  void operator()(const typename jit::VBroadcastTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref,
                  int64_t h,
                  const typename jit::VBroadcastTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size(), static_cast<size_t>(attr));
    EXPECT_EQ(yref.size(), x.size() * h);
    std::vector<T> y(yref.size());
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    T* y_data = y.data();
    tgt(x_data, y_data, h, attr);
    ExpectEQ<T>(y_data, yref_data, yref.size());
  }
};

template <typename T>
struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
  void operator()(const typename jit::XYNTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(yref.size(), x.size());
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    const int d = yref.size();
    std::vector<T> ytgt(d);
    T* ytgt_data = ytgt.data();
    // test normal
    tgt(x_data, ytgt_data, d);
    ExpectEQ<T>(ytgt_data, yref_data, d);
    // test inplace x
    std::copy(x.begin(), x.end(), ytgt.begin());
    tgt(ytgt_data, ytgt_data, d);
    ExpectEQ<T>(ytgt_data, yref_data, d);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<T>,
                         typename jit::LSTMTuples<T>::attr_type> {
  void operator()(const typename jit::LSTMTuples<T>::func_type tgt,
                  const std::vector<T>& xsrc, const std::vector<T>& wp,
                  const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
                  const std::vector<T>& ht_ref,
                  const typename jit::LSTMTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(ct_ref.size(), ht_ref.size());
    EXPECT_EQ(ct_1.size(), ht_ref.size());
    EXPECT_EQ(xsrc.size(), 4 * ht_ref.size());
    EXPECT_EQ(wp.size(), 3 * ht_ref.size());

    // x could be changed after compute, so copy to save src
    int d = ht_ref.size();
    std::vector<T> x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size());
    std::vector<T> checked(2 * d);
    std::copy(xsrc.begin(), xsrc.end(), x.begin());

    const T* ct_1_data = ct_1.data();
    const T* wp_data = wp.data();
    const T* ct_ref_data = ct_ref.data();
    const T* ht_ref_data = ht_ref.data();
    T* x_data = x.data();
    T* ct_data = ct.data();
    T* ht_data = ht.data();
    T* checked_data = checked.data();

    jit::lstm_t step;
    step.gates = x_data;
    step.ct_1 = ct_1_data;
    step.ct = ct_data;
    step.ht = ht_data;
    if (attr.use_peephole) {
      step.wp = wp_data;
      step.checked = checked_data;
    }

    tgt(&step, &attr);
    ExpectEQ<T>(ct_data, ct_ref_data, d);
    ExpectEQ<T>(ht_data, ht_ref_data, d);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>,
                         typename jit::GRUTuples<T>::attr_type> {
  void operator()(const typename jit::GRUTuples<T>::func_type tgt,
                  const std::vector<T>& xsrc, const std::vector<T>& ht_1,
                  const std::vector<T>& ht_ref,
                  const typename jit::GRUTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(ht_1.size(), ht_ref.size());
    EXPECT_EQ(xsrc.size(), 3 * ht_ref.size());

    // x could be changed after compute, so copy to save src
    int d = ht_ref.size();
    std::vector<T> x(xsrc.size()), ht(ht_ref.size());
    std::copy(xsrc.begin(), xsrc.end(), x.begin());
    const T* ht_1_data = ht_1.data();
    const T* ht_ref_data = ht_ref.data();
    T* x_data = x.data();
    T* ht_data = ht.data();
    jit::gru_t step;
    step.gates = x_data;
    step.ht_1 = ht_1_data;
    step.ht = ht_data;
    tgt(&step, &attr);
    ExpectEQ<T>(ht_data, ht_ref_data, d);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
                         typename jit::SeqPoolTuples<T>::attr_type> {
  void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref,
                  const typename jit::SeqPoolTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
    int w = yref.size();
    std::vector<T> y(w);
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    T* y_data = y.data();
    tgt(x_data, y_data, &attr);
    ExpectEQ<T>(y_data, yref_data, w);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
                         std::vector<int64_t>, std::vector<T>,
                         typename jit::EmbSeqPoolTuples<T>::attr_type> {
  void operator()(const typename jit::EmbSeqPoolTuples<T>::func_type tgt,
                  const std::vector<T>& table, const std::vector<int64_t>& idx,
                  const std::vector<T>& oref,
                  const typename jit::EmbSeqPoolTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(table.size(),
              static_cast<size_t>(attr.table_height * attr.table_width));
    EXPECT_EQ(idx.size(),
              static_cast<size_t>(attr.index_height * attr.index_width));
    EXPECT_EQ(oref.size(),
              static_cast<size_t>(attr.table_width * attr.index_width));
    const T* table_data = table.data();
    const int64_t* idx_data = idx.data();
    const T* oref_data = oref.data();
    int o_w = oref.size();
    std::vector<T> out(o_w);
    T* o_data = out.data();
    tgt(table_data, idx_data, o_data, &attr);
    ExpectEQ<T>(o_data, oref_data, o_w);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::SgdTuples<T>, T, std::vector<T>, std::vector<T>,
                         std::vector<int64_t>, std::vector<T>,
                         typename jit::SgdTuples<T>::attr_type> {
  void operator()(const typename jit::SgdTuples<T>::func_type tgt, const T lr,
                  const std::vector<T>& param, const std::vector<T>& grad,
                  const std::vector<int64_t>& rows, const std::vector<T>& oref,
                  const typename jit::SgdTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(param.size(),
              static_cast<size_t>(attr.param_height * attr.param_width));
    EXPECT_EQ(grad.size(),
              static_cast<size_t>(attr.grad_height * attr.grad_width));
    EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
    EXPECT_EQ(param.size(), oref.size());
    const T* param_data = param.data();
    const T* grad_data = grad.data();
    const int64_t* rows_data = rows.data();
    const T* oref_data = oref.data();

    std::vector<T> out(oref.size());
    T* o_data = out.data();
    tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
    // only the selected rows should be equal
    for (size_t i = 0; i < rows.size(); ++i) {
      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
    }

    // inplace
    std::copy(param.begin(), param.end(), out.begin());
    tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
    for (size_t i = 0; i < rows.size(); ++i) {
      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
    }
  }
};

template <typename T>
struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>,
                         typename jit::MatMulTuples<T>::attr_type> {
  void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
                  const std::vector<T>& a, const std::vector<T>& b,
                  const std::vector<T>& cref,
                  const typename jit::MatMulTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
    EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
    EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
    std::vector<T> c(cref.size());
    const T* a_data = a.data();
    const T* b_data = b.data();
    const T* cref_data = cref.data();
    T* c_data = c.data();
    tgt(a_data, b_data, c_data, &attr);
    ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>, std::vector<T>, int, float, int> {
  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
                  const std::vector<T>& scale, const std::vector<T>& bias,
                  int left, const float epsilon, int right) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
    EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
    EXPECT_EQ(mean.size(), static_cast<size_t>(left));
    EXPECT_EQ(var.size(), static_cast<size_t>(left));
    EXPECT_EQ(scale.size(), static_cast<size_t>(right));
    EXPECT_EQ(bias.size(), static_cast<size_t>(right));
    std::vector<T> outtgt(outref.size());
    const T* scale_data = scale.data();
    const T* bias_data = bias.data();
    T* x_data = x.data();
    T* mean_data = mean.data();
    T* var_data = var.data();
    T* outref_data = outref.data();
    T* outtgt_data = outtgt.data();

    tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
        epsilon, right);
    ExpectEQ<T>(outtgt_data, outref_data, left * right);
  }
};

template <typename T>
struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<int>,
                         int> {
  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
                  const int seq_len, const std::vector<T>& x,
                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
                  std::vector<int>& trackref, int tag_num) {          // NOLINT
    constexpr int state_trans_base_idx = 2;
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
    EXPECT_EQ(w.size(),
              static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
    EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
    EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
    std::vector<T> alphatgt(alpharef.size());
    std::vector<int> tracktgt(trackref.size());

    memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
    tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
        tracktgt.data(), tag_num);
    ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
    ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
  }
};

template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
          typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
  TestFuncWithRefer<KernelTuples, Args...> test;
  // test jitcode
  auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
  if (jitcode) {
    VLOG(10) << "Test Jitcode Kernel ";
    test(jitcode, args...);
  }
  // test all impls in more
  jit::KernelKey kkey(KT, PlaceType());
  auto& pool = jit::KernelPool().Instance().AllKernels();
  auto iter = pool.find(kkey);
  if (iter != pool.end()) {
    auto& impls = iter->second;
    for (auto& impl : impls) {
      auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
      if (i && i->UseMe(attr)) {
        auto more = i->GetFunc();
        VLOG(10) << "Test More Kernel : " << i->ImplType();
        test(more, args...);
      }
    }
  }
  // test result from Get function
  // VLOG(10) << "Test Get function ";
  auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
  test(tgt, args...);
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelXYZNTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  for (int d : TestSizes()) {
    auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
    EXPECT_TRUE(ref != nullptr);

    std::vector<T> x(d), y(d), zref(d);
    RandomVec<T>(d, x.data());
    RandomVec<T>(d, y.data());

    std::vector<T> xinp(d), yinp(d);  // inplace test
    std::copy(x.begin(), x.end(), xinp.begin());
    std::copy(y.begin(), y.end(), yinp.begin());

    const T* x_data = x.data();
    const T* y_data = y.data();
    T* zref_data = zref.data();
    T* xinp_data = xinp.data();
    T* yinp_data = yinp.data();

    // test refer code inplace
    ref(x_data, y_data, zref_data, d);
    ref(x_data, yinp_data, yinp_data, d);
    ref(xinp_data, y_data, xinp_data, d);
    ExpectEQ<T>(xinp_data, zref_data, d);
    ExpectEQ<T>(yinp_data, zref_data, d);

    TestAllImpls<KT, jit::XYZNTuples<T>, PlaceType, std::vector<T>,
                 std::vector<T>, std::vector<T>>(d, x, y, zref);
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelAXYNTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  for (int d : TestSizes()) {
    auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
    EXPECT_TRUE(ref != nullptr);

    const T a = static_cast<T>(3);
    std::vector<T> x(d), yref(d);
    std::vector<T> xinp(d);  // inplace test
    RandomVec<T>(d, x.data());
    std::copy(x.begin(), x.end(), xinp.begin());

    const T* x_data = x.data();
    T* yref_data = yref.data();
    T* xinp_data = xinp.data();
    // test refer code inplace
    ref(&a, x_data, yref_data, d);
    ref(&a, xinp_data, xinp_data, d);
    ExpectEQ<T>(xinp_data, yref_data, d);

    TestAllImpls<KT, jit::AXYNTuples<T>, PlaceType, T, std::vector<T>,
                 std::vector<T>>(d, a, x, yref);
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelXRNTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  auto last_acc = FLAGS_acc;
  FLAGS_acc = 1e-4;
  for (int d : TestSizes()) {
    auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
    EXPECT_TRUE(ref != nullptr);
    std::vector<T> x(d);
    RandomVec<T>(d, x.data());
    T ref_res;
    ref(x.data(), &ref_res, d);
    TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
                                                                      ref_res);
  }
  FLAGS_acc = last_acc;
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelXYNTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  for (int d : TestSizes()) {
    auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
    EXPECT_TRUE(ref != nullptr);

    std::vector<T> x(d), yref(d);
    std::vector<T> xinp(d);  // inplace test
    RandomVec<T>(d, x.data());
    std::copy(x.begin(), x.end(), xinp.begin());

    const T* x_data = x.data();
    T* yref_data = yref.data();
    T* xinp_data = xinp.data();
    // test refer code inplace
    ref(x_data, yref_data, d);
    ref(xinp_data, xinp_data, d);
    ExpectEQ<T>(xinp_data, yref_data, d);

    TestAllImpls<KT, jit::XYNTuples<T>, PlaceType, std::vector<T>,
                 std::vector<T>>(d, x, yref);
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelLSTMTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  for (int d : test_sizes) {
    for (bool use_peephole : {true, false}) {
      for (auto& act_gate : all_acts) {
        for (auto& act_cand : all_acts) {
          for (auto& act_cell : all_acts) {
            const jit::lstm_attr_t attr(
                d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
                jit::to_kerneltype(act_cell), use_peephole);
            auto ref = jit::GetRefer<KT, jit::LSTMTuples<T>>();
            EXPECT_TRUE(ref != nullptr);
            std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
            std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
            RandomVec<T>(4 * d, xsrc.data());
            RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
            RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
            // x could be changed after compute, so copy to save src
            std::vector<T> x(xsrc.size());
            std::copy(xsrc.begin(), xsrc.end(), x.begin());
            const T* ct_1_data = ct_1.data();
            const T* wp_data = wp.data();
            T* x_data = x.data();
            T* checked_data = checked.data();
            T* ct_ref_data = ct_ref.data();
            T* ht_ref_data = ht_ref.data();
            jit::lstm_t step;
            step.gates = x_data;
            step.ct_1 = ct_1_data;
            step.ct = ct_ref_data;
            step.ht = ht_ref_data;
            if (use_peephole) {
              step.wp = wp_data;
              step.checked = checked_data;
            }
            ref(&step, &attr);
            VLOG(10) << attr;
            TestAllImpls<KT, jit::LSTMTuples<T>, PlaceType, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>>(attr, xsrc, wp, ct_1, ct_ref, ht_ref,
                                         attr);
          }
        }
      }
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelGRUTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  for (int d : test_sizes) {
    for (auto& act_gate : all_acts) {
      for (auto& act_cand : all_acts) {
        const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
                                   jit::to_kerneltype(act_cand));
        auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
        RandomVec<T>(3 * d, xsrc.data());
        RandomVec<T>(d, ht_1.data());
        // x could be changed after compute, so copy to save src
        std::vector<T> x(xsrc.size());
        std::copy(xsrc.begin(), xsrc.end(), x.begin());
        const T* ht_1_data = ht_1.data();
        T* x_data = x.data();
        T* ht_ref_data = ht_ref.data();
        jit::gru_t step;
        step.gates = x_data;
        step.ht_1 = ht_1_data;
        step.ht = ht_ref_data;
        ref(&step, &attr);
        VLOG(10) << attr;
        TestAllImpls<KT, jit::GRUTuples<T>, PlaceType, std::vector<T>,
                     std::vector<T>, std::vector<T>>(attr, xsrc, ht_1, ht_ref,
                                                     attr);
      }
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelSeqPoolTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  for (auto type : pool_types) {
    for (int w : test_sizes) {
      jit::seq_pool_attr_t attr(w, type);
      for (int h : test_sizes) {
        attr.h = h;
        auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> x(h * w), yref(w);
        RandomVec<T>(h * w, x.data());
        const T* x_data = x.data();
        T* yref_data = yref.data();
        ref(x_data, yref_data, &attr);
        VLOG(10) << attr;
        TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
                     std::vector<T>>(attr, x, yref, attr);
      }
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelMatMulTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  auto last_acc = FLAGS_acc;
  // export MKL_CBWR=AVX would make MKL force to use AVX
  // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic
  FLAGS_acc = 1e-3;
  for (int m : {1, 2, 3, 4}) {
    for (int n : {1, 2, 3, 4}) {
      for (int k : TestSizes()) {
        auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> a(m * k), b(k * n), c(m * n);
        RandomVec<T>(m * k, a.data());
        RandomVec<T>(k * n, b.data());
        const T* a_data = a.data();
        const T* b_data = b.data();
        T* c_data = c.data();
        const jit::matmul_attr_t attr{m, n, k};
        ref(a_data, b_data, c_data, &attr);
        TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
                     std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
      }
    }
  }
  FLAGS_acc = last_acc;
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelSoftmaxTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  for (int bs : {1, 2, 10}) {
    for (int n : TestSizes()) {
      auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(bs * n), y(bs * n);
      RandomVec<T>(bs * n, x.data());
      const T* x_data = x.data();
      T* y_data = y.data();

      std::vector<T> xinp(x.size());  // inplace test
      std::copy(x.begin(), x.end(), xinp.begin());
      ref(x_data, y_data, n, bs);
      T* xinp_data = xinp.data();
      ref(xinp_data, xinp_data, n, bs);
      ExpectEQ<T>(xinp_data, y_data, n * bs);

      TestAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType, std::vector<T>,
                   std::vector<T>>(n, x, y, n, bs);
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelEmbSeqPoolTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  int64_t tbl_h = 1e4;
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum};  // only support sum yet
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  for (int tbl_w : test_sizes) {
    std::vector<T> table(tbl_h * tbl_w);
    RandomVec<T>(tbl_h * tbl_w, table.data());
    const T* table_data = table.data();
    for (auto type : pool_types) {
      for (int idx_w : {1, 2, 10, 16}) {
        for (int idx_h : {1, 2, 9, 13, 16}) {
          auto ref = jit::GetRefer<KT, jit::EmbSeqPoolTuples<T>>();
          EXPECT_TRUE(ref != nullptr);
          std::vector<int64_t> idx(idx_h * idx_w);
          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
          int64_t out_w = tbl_w * idx_w;
          std::vector<T> oref(out_w);
          const int64_t* idx_data = idx.data();
          T* o_data = oref.data();
          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
                                        type);
          ref(table_data, idx_data, o_data, &attr);

          TestAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType, std::vector<T>,
                       std::vector<int64_t>, std::vector<T>>(attr, table, idx,
                                                             oref, attr);
        }
      }
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelSgdTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
    PADDLE_ENFORCE_GT(n, 0);
    std::vector<int64_t> all, out;
    for (int i = 0; i < n; ++i) {
      all.push_back(i);
    }
    std::random_shuffle(all.begin(), all.end());
    out.insert(out.begin(), all.begin(), all.begin() + n);
    return out;
  };
  for (int param_h : {1, 10}) {
    for (int grad_w : TestSizes()) {
      std::vector<T> param(param_h * grad_w);
      std::vector<T> param_out(param_h * grad_w);
      RandomVec<T>(param_h * grad_w, param.data());
      const T* param_data = param.data();
      T* out_data = param_out.data();
      for (int rows_size = 1; rows_size <= param_h; ++rows_size) {
        std::vector<T> grad(rows_size * grad_w);
        std::vector<int64_t> rows =
            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
        RandomVec<T>(rows_size * grad_w, grad.data());
        const int64_t* rows_data = rows.data();
        const T* grad_data = grad.data();
        auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
        ref(&lr, param_data, grad_data, rows_data, out_data, &attr);

        // inplace test
        std::vector<T> inp(param.size());
        std::copy(param.begin(), param.end(), inp.begin());
        T* inp_data = inp.data();
        ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr);
        // only the selected rows should be equal
        for (int i = 0; i < rows_size; ++i) {
          ExpectEQ<T>(inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w,
                      grad_w);
        }

        TestAllImpls<KT, jit::SgdTuples<T>, PlaceType, T, std::vector<T>,
                     std::vector<T>, std::vector<int64_t>, std::vector<T>>(
            attr, lr, param, grad, rows, param_out, attr);
      }
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelNCHW16CMulNCTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  const int n = 3, c = 16 * 4, h = 10, w = 10;
  auto ref = jit::GetRefer<KT, jit::NCHW16CMulNCTuples<T>>();
  EXPECT_TRUE(ref != nullptr);
  int sz = n * c * h * w;
  std::vector<T> x(sz), y(n * c), zref(sz);
  std::vector<T> ztgt(sz), zjit(sz);
  RandomVec<T>(sz, x.data());
  RandomVec<T>(n * c, y.data());

  const T* x_data = x.data();
  const T* y_data = y.data();
  T* zref_data = zref.data();
  T* ztgt_data = ztgt.data();
  T* zjit_data = zjit.data();
  constexpr int simd_width = ZMM_FLOAT_BLOCK;
  int C = c / simd_width;
  auto tgt = jit::Get<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
  auto jitcode = jit::GetJitCode<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
  EXPECT_TRUE(tgt != nullptr);

  if (std::is_same<T, float>::value &&
      paddle::platform::MayIUse(paddle::platform::avx512f)) {
    EXPECT_TRUE(jitcode != nullptr);
  }
  for (int ni = 0; ni < n; ni++) {
    for (int ci = 0; ci < C; ci++) {
      auto ptr_x =
          x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
      auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
      auto ptr_zref =
          zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
      auto ptr_ztgt =
          ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;

      ref(ptr_x, ptr_y, ptr_zref, h, w);
      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);

      if (jitcode) {
        auto ptr_zjit =
            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
      }
    }
  }
  ExpectEQ<T>(ztgt_data, zref_data, sz);
  if (jitcode) {
    ExpectEQ<T>(zjit_data, zref_data, sz);
  }
}

template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestKernelLayerNormTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  const T epsilon = 9.99999975e-06;
  for (int n : {1, 2, 10}) {
    for (int x_dim_0 : {1, 9, 17, 50}) {
      int left = n * x_dim_0;
      for (int x_dim_1 : TestSizes()) {
        int right = x_dim_1;
        auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        int sz = left * right;
        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
            outref(sz);
        RandomVec<T>(sz, x.data());
        RandomVec<T>(left, mean.data());
        RandomVec<T>(left, var.data());
        RandomVec<T>(right, scale.data());
        RandomVec<T>(right, bias.data());

        const T* scale_data = scale.data();
        const T* bias_data = bias.data();
        T* x_data = x.data();
        T* mean_data = mean.data();
        T* var_data = var.data();
        T* outref_data = outref.data();

        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
            left, epsilon, right);

        TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
                     std::vector<T>, std::vector<T>, std::vector<T>,
                     std::vector<T>, std::vector<T>, int, float>(
            right, x, outref, mean, var, scale, bias, left, epsilon, right);
      }
    }
  }
}

template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestKernelCRFDecodingTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  constexpr int state_trans_base_idx = 2;
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
  for (int seq_len : {1, 11, 17, 50}) {
    for (int tag_num : test_sizes) {
      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      int x_sz = seq_len * tag_num;
      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
      std::vector<int> trackref(x_sz);
      RandomVec<T>(x_sz, x.data());
      RandomVec<T>(w_sz, w.data());

      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
          trackref.data(), tag_num);

      TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
                   std::vector<T>, std::vector<T>, std::vector<T>,
                   std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
                                          trackref, tag_num);
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelVBroadcastTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  for (int w : TestSizes()) {
    std::vector<T> x(w);
    RandomVec<T>(w, x.data());
    const T* x_data = x.data();
    for (int64_t h : {1, 2, 6}) {
      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> y(w * h);
      T* y_data = y.data();
      ref(x_data, y_data, h, w);

      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
                                            static_cast<int64_t>(w));
    }
  }
}

#define TEST_CPU_KERNEL(test_tuple, kernel_type)                 \
  TEST(JITKernel, kernel_type) {                                 \
    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
  }

TEST_CPU_KERNEL(XYZNTuples, kVMul);
TEST_CPU_KERNEL(XYZNTuples, kVAdd);
TEST_CPU_KERNEL(XYZNTuples, kVAddRelu);
TEST_CPU_KERNEL(XYZNTuples, kVSub);

TEST_CPU_KERNEL(AXYNTuples, kVScal);
TEST_CPU_KERNEL(AXYNTuples, kVAddBias);

TEST_CPU_KERNEL(XRNTuples, kHMax);
TEST_CPU_KERNEL(XRNTuples, kHSum);

TEST_CPU_KERNEL(XYNTuples, kVRelu);
TEST_CPU_KERNEL(XYNTuples, kVIdentity);
TEST_CPU_KERNEL(XYNTuples, kVSquare);
TEST_CPU_KERNEL(XYNTuples, kVExp);
TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
TEST_CPU_KERNEL(XYNTuples, kVTanh);
TEST_CPU_KERNEL(XYNTuples, kVCopy);

TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);

TEST_CPU_KERNEL(GRUTuples, kGRUH1);
TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1);
TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2);

TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC);

TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool);
TEST_CPU_KERNEL(MatMulTuples, kMatMul);
TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax);
TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
TEST_CPU_KERNEL(SgdTuples, kSgd);
TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);

TEST(JITKernel_key, lstm) {
  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);

  auto key1 = jit::JitCodeKey<jit::lstm_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::lstm_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::lstm_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::lstm_attr_t>(attr4);

  EXPECT_TRUE(key1 != key2);
  EXPECT_TRUE(key2 == key3);
  EXPECT_TRUE(key3 != key4);
}

TEST(JITKernel_key, gru) {
  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);

  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);

  EXPECT_TRUE(key1 != key2);
  EXPECT_TRUE(key2 == key3);
  EXPECT_TRUE(key3 != key4);
}
// TODO(TJ): add more test about key and pool