/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/jit/kernels.h" DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template void RandomVec(const int n, T* a, const T lower = static_cast(-2.f), const T upper = static_cast(2.f)) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); for (int i = 0; i < n; ++i) { a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); } } template void ExpectEQ(const T* target, const T* refer, size_t n) { if (std::is_floating_point::value) { for (size_t i = 0; i < n; ++i) { EXPECT_NEAR(target[i], refer[i], FLAGS_acc) << " at index : " << i; } } else { for (size_t i = 0; i < n; ++i) { EXPECT_EQ(target[i], refer[i]) << " at index : " << i; } } } std::vector TestSizes() { std::vector s; for (int i = 1; i < 32; ++i) { s.push_back(i); } // test some large size s.push_back(100); s.push_back(1000); s.push_back(2000); return s; } namespace jit = phi::jit; using CPUPlace = phi::CPUPlace; template void TestAllImpls(const typename KernelTuple::attr_type& attr, const Tester& verifier, const Args&... args) { auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); for (auto f : funcs) { VLOG(10) << "Test Kernel " << f.first; verifier(f.second, args...); } } template void TestKernelXYZN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); RandomVec(d, x.data()); RandomVec(d, y.data()); std::vector xinp(d), yinp(d); // inplace test std::copy(x.begin(), x.end(), xinp.begin()); std::copy(y.begin(), y.end(), yinp.begin()); const T* x_data = x.data(); const T* y_data = y.data(); T* zref_data = zref.data(); T* xinp_data = xinp.data(); T* yinp_data = yinp.data(); // test refer code inplace ref(x_data, y_data, zref_data, d); ref(x_data, yinp_data, yinp_data, d); ref(xinp_data, y_data, xinp_data, d); ExpectEQ(xinp_data, zref_data, d); ExpectEQ(yinp_data, zref_data, d); auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& x, const std::vector& y, const std::vector& zref) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(zref.size(), x.size()); EXPECT_EQ(zref.size(), y.size()); const T* x_data = x.data(); const T* y_data = y.data(); const T* zref_data = zref.data(); const int d = zref.size(); std::vector ztgt(d); T* ztgt_data = ztgt.data(); // test normal tgt(x_data, y_data, ztgt_data, d); ExpectEQ(ztgt_data, zref_data, d); // test inplace x std::copy(x.begin(), x.end(), ztgt.begin()); tgt(ztgt_data, y_data, ztgt_data, d); ExpectEQ(ztgt_data, zref_data, d); // test inplace y std::copy(y.begin(), y.end(), ztgt.begin()); tgt(x_data, ztgt_data, ztgt_data, d); ExpectEQ(ztgt_data, zref_data, d); }; TestAllImpls(d, verifier, x, y, zref); } } template void TestKernelAXYN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); const T a = static_cast(3); std::vector x(d), yref(d); std::vector xinp(d); // inplace test RandomVec(d, x.data()); std::copy(x.begin(), x.end(), xinp.begin()); const T* x_data = x.data(); T* yref_data = yref.data(); T* xinp_data = xinp.data(); // test refer code inplace ref(&a, x_data, yref_data, d); ref(&a, xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); auto verifier = [](const typename KernelTuple::func_type tgt, const T a, const std::vector& x, const std::vector& yref) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(yref.size(), x.size()); const T* x_data = x.data(); const T* yref_data = yref.data(); const int d = yref.size(); std::vector ytgt(d); T* ytgt_data = ytgt.data(); // test normal tgt(&a, x_data, ytgt_data, d); ExpectEQ(ytgt_data, yref_data, d); // test inplace x std::copy(x.begin(), x.end(), ytgt.begin()); tgt(&a, ytgt_data, ytgt_data, d); ExpectEQ(ytgt_data, yref_data, d); }; TestAllImpls(d, verifier, a, x, yref); } } template void TestKernelXYN() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), yref(d); std::vector xinp(d); // inplace test RandomVec(d, x.data()); std::copy(x.begin(), x.end(), xinp.begin()); const T* x_data = x.data(); T* yref_data = yref.data(); T* xinp_data = xinp.data(); // test refer code inplace ref(x_data, yref_data, d); ref(xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& x, const std::vector& yref) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(yref.size(), x.size()); const T* x_data = x.data(); const T* yref_data = yref.data(); const int d = yref.size(); std::vector ytgt(d); T* ytgt_data = ytgt.data(); // test normal tgt(x_data, ytgt_data, d); ExpectEQ(ytgt_data, yref_data, d); // test inplace x std::copy(x.begin(), x.end(), ytgt.begin()); tgt(ytgt_data, ytgt_data, d); ExpectEQ(ytgt_data, yref_data, d); }; TestAllImpls(d, verifier, x, yref); } } template void TestKernelLSTM() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); for (int d : test_sizes) { for (bool use_peephole : {true, false}) { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { for (auto& act_cell : all_acts) { const jit::lstm_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); RandomVec(4 * d, xsrc.data()); RandomVec(3 * d, wp.data(), -1.f, 1.f); RandomVec(d, ct_1.data(), -1.f, 1.f); // x could be changed after compute, so copy to save src std::vector x(xsrc.size()); std::copy(xsrc.begin(), xsrc.end(), x.begin()); const T* ct_1_data = ct_1.data(); const T* wp_data = wp.data(); T* x_data = x.data(); T* checked_data = checked.data(); T* ct_ref_data = ct_ref.data(); T* ht_ref_data = ht_ref.data(); jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; step.ct = ct_ref_data; step.ht = ht_ref_data; if (use_peephole) { step.wp = wp_data; step.checked = checked_data; } ref(&step, &attr); VLOG(10) << attr; auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& xsrc, const std::vector& wp, const std::vector& ct_1, const std::vector& ct_ref, const std::vector& ht_ref, const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(ct_ref.size(), ht_ref.size()); EXPECT_EQ(ct_1.size(), ht_ref.size()); EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); EXPECT_EQ(wp.size(), 3 * ht_ref.size()); // x could be changed after compute, so copy to save src int d = ht_ref.size(); std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); std::vector checked(2 * d); std::copy(xsrc.begin(), xsrc.end(), x.begin()); const T* ct_1_data = ct_1.data(); const T* wp_data = wp.data(); const T* ct_ref_data = ct_ref.data(); const T* ht_ref_data = ht_ref.data(); T* x_data = x.data(); T* ct_data = ct.data(); T* ht_data = ht.data(); T* checked_data = checked.data(); jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; step.ct = ct_data; step.ht = ht_data; if (attr.use_peephole) { step.wp = wp_data; step.checked = checked_data; } tgt(&step, &attr); ExpectEQ(ct_data, ct_ref_data, d); ExpectEQ(ht_data, ht_ref_data, d); }; TestAllImpls( attr, verifier, xsrc, wp, ct_1, ct_ref, ht_ref, attr); } } } } } } template void TestKernelGRU() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); for (int d : test_sizes) { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { const jit::gru_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); RandomVec(3 * d, xsrc.data()); RandomVec(d, ht_1.data()); // x could be changed after compute, so copy to save src std::vector x(xsrc.size()); std::copy(xsrc.begin(), xsrc.end(), x.begin()); const T* ht_1_data = ht_1.data(); T* x_data = x.data(); T* ht_ref_data = ht_ref.data(); jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_ref_data; ref(&step, &attr); VLOG(10) << attr; auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& xsrc, const std::vector& ht_1, const std::vector& ht_ref, const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(ht_1.size(), ht_ref.size()); EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); // x could be changed after compute, so copy to save src int d = ht_ref.size(); std::vector x(xsrc.size()), ht(ht_ref.size()); std::copy(xsrc.begin(), xsrc.end(), x.begin()); const T* ht_1_data = ht_1.data(); const T* ht_ref_data = ht_ref.data(); T* x_data = x.data(); T* ht_data = ht.data(); jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; tgt(&step, &attr); ExpectEQ(ht_data, ht_ref_data, d); }; TestAllImpls( attr, verifier, xsrc, ht_1, ht_ref, attr); } } } } template void TestKernelLayerNorm() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const T epsilon = 9.99999975e-06; for (int n : {1, 2, 10}) { for (int x_dim_0 : {1, 9, 17, 50}) { int left = n * x_dim_0; for (int x_dim_1 : TestSizes()) { int right = x_dim_1; auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int sz = left * right; std::vector x(sz), mean(left), var(left), scale(right), bias(right), outref(sz); RandomVec(sz, x.data()); RandomVec(left, mean.data()); RandomVec(left, var.data()); RandomVec(right, scale.data()); RandomVec(right, bias.data()); const T* scale_data = scale.data(); const T* bias_data = bias.data(); T* x_data = x.data(); T* mean_data = mean.data(); T* var_data = var.data(); T* outref_data = outref.data(); ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, left, epsilon, right); auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& x_, const std::vector& outref_, const std::vector& mean_, const std::vector& var_, const std::vector& scale, const std::vector& bias, const int& left, const float& epsilon, const typename KernelTuple::attr_type& right) { EXPECT_TRUE(tgt != nullptr); std::vector outtgt(outref_.size()); std::vector x(x_.size()); std::vector mean(mean_.size()); std::vector var(var_.size()); std::vector outref(outref_.size()); std::copy(x_.begin(), x_.end(), x.begin()); std::copy(mean_.begin(), mean_.end(), mean.begin()); std::copy(var_.begin(), var_.end(), var.begin()); std::copy(outref_.begin(), outref_.end(), outref.begin()); EXPECT_EQ(x.size(), static_cast(left * right)); EXPECT_EQ(outref.size(), static_cast(left * right)); EXPECT_EQ(mean.size(), static_cast(left)); EXPECT_EQ(var.size(), static_cast(left)); EXPECT_EQ(scale.size(), static_cast(right)); EXPECT_EQ(bias.size(), static_cast(right)); const T* scale_data = scale.data(); const T* bias_data = bias.data(); T* x_data = x.data(); T* mean_data = mean.data(); T* var_data = var.data(); T* outref_data = outref.data(); T* outtgt_data = outtgt.data(); tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left, epsilon, right); ExpectEQ(outtgt_data, outref_data, left * right); }; TestAllImpls(right, verifier, x, outref, mean, var, scale, bias, left, epsilon, right); } } } } template void TestKernelCRFDecoding() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); constexpr int state_trans_base_idx = 2; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); for (int seq_len : {1, 11, 17, 50}) { for (int tag_num : test_sizes) { auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); int x_sz = seq_len * tag_num; int w_sz = (tag_num + state_trans_base_idx) * tag_num; std::vector x(x_sz), w(w_sz), alpharef(x_sz); std::vector trackref(x_sz); RandomVec(x_sz, x.data()); RandomVec(w_sz, w.data()); ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), trackref.data(), tag_num); auto verifier = [](const typename KernelTuple::func_type tgt, const int& seq_len, const std::vector& x, const std::vector& w, const std::vector& alpharef, const std::vector& trackref, const typename KernelTuple::attr_type& tag_num) { constexpr int state_trans_base_idx = 2; EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); EXPECT_EQ( w.size(), static_cast((tag_num + state_trans_base_idx) * tag_num)); EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); std::vector alphatgt(alpharef.size()); std::vector tracktgt(trackref.size()); memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int)); tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), tracktgt.data(), tag_num); ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); }; TestAllImpls( tag_num, verifier, seq_len, x, w, alpharef, trackref, tag_num); } } } template void TestKernelSeqPool() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); for (auto type : pool_types) { for (int w : test_sizes) { jit::seq_pool_attr_t attr(w, type); for (int h : test_sizes) { attr.h = h; auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); RandomVec(h * w, x.data()); const T* x_data = x.data(); T* yref_data = yref.data(); ref(x_data, yref_data, &attr); VLOG(10) << attr; auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& x, const std::vector& yref, const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(x.size() % yref.size(), static_cast(0)); int w = yref.size(); std::vector y(w); const T* x_data = x.data(); const T* yref_data = yref.data(); T* y_data = y.data(); tgt(x_data, y_data, &attr); ExpectEQ(y_data, yref_data, w); }; TestAllImpls(attr, verifier, x, yref, attr); } } } } template void TestKernelEmbSeqPool() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); int64_t tbl_h = 1e4; std::vector pool_types = { jit::SeqPoolType::kSum}; // only support sum yet auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); for (int tbl_w : test_sizes) { std::vector table(tbl_h * tbl_w); RandomVec(tbl_h * tbl_w, table.data()); const T* table_data = table.data(); for (auto type : pool_types) { for (int idx_w : {1, 2, 10, 16}) { for (int idx_h : {1, 2, 9, 13, 16}) { auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector idx(idx_h * idx_w); RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); int64_t out_w = tbl_w * idx_w; std::vector oref(out_w); const int64_t* idx_data = idx.data(); T* o_data = oref.data(); jit::emb_seq_pool_attr_t attr( tbl_h, tbl_w, idx_h, idx_w, out_w, type); ref(table_data, idx_data, o_data, &attr); auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& table, const std::vector& idx, const std::vector& oref, const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ( table.size(), static_cast(attr.table_height * attr.table_width)); EXPECT_EQ( idx.size(), static_cast(attr.index_height * attr.index_width)); EXPECT_EQ(oref.size(), static_cast(attr.table_width * attr.index_width)); const T* table_data = table.data(); const int64_t* idx_data = idx.data(); const T* oref_data = oref.data(); int o_w = oref.size(); std::vector out(o_w); T* o_data = out.data(); tgt(table_data, idx_data, o_data, &attr); ExpectEQ(o_data, oref_data, o_w); }; TestAllImpls( attr, verifier, table, idx, oref, attr); } } } } } template void TestKernelMatMul() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); auto last_acc = FLAGS_acc; // export MKL_CBWR=AVX would make MKL force to use AVX // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic FLAGS_acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); RandomVec(m * k, a.data()); RandomVec(k * n, b.data()); const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); const jit::matmul_attr_t attr{m, n, k}; ref(a_data, b_data, c_data, &attr); auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& a, const std::vector& b, const std::vector& cref, const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); std::vector c(cref.size()); const T* a_data = a.data(); const T* b_data = b.data(); const T* cref_data = cref.data(); T* c_data = c.data(); tgt(a_data, b_data, c_data, &attr); ExpectEQ(c_data, cref_data, attr.m * attr.n); }; TestAllImpls(attr, verifier, a, b, c, attr); } } } FLAGS_acc = last_acc; } template void TestKernelAdam() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const T lr = 0.1; const T beta1 = 0.99; const T beta2 = 0.95; const T beta1_pow = beta1 * beta1; const T beta2_pow = beta2 * beta2; const T epsilon = 0.000001; const int64_t numel = 123; T learning_rate = lr * (sqrt(1 - beta2_pow) / (1 - beta1_pow)); T eps = epsilon * sqrt(1 - beta2_pow); std::vector param(numel); std::vector grad(numel); std::vector mom1(numel); std::vector mom2(numel); std::vector param_out(param.size()); std::vector mom1_out(mom1.size()); std::vector mom2_out(mom2.size()); RandomVec(numel, param.data(), 0.5f); RandomVec(numel, grad.data(), 0.5f); RandomVec(numel, mom1.data(), 0.5f); RandomVec(numel, mom2.data(), 0.5f); auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); jit::adam_attr_t attr(beta1, beta2); ref(beta1, beta2, -learning_rate, eps, numel, grad.data(), mom1.data(), mom2.data(), param.data(), mom1_out.data(), mom2_out.data(), param_out.data()); auto verifier = [](const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, int64_t numel, const std::vector& grad, const std::vector& mom1, const std::vector& mom2, const std::vector& param, const std::vector& ref_mom1_out, const std::vector& ref_mom2_out, const std::vector& ref_param_out) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(param.size(), static_cast(numel)); EXPECT_EQ(grad.size(), static_cast(numel)); EXPECT_EQ(mom1.size(), static_cast(numel)); EXPECT_EQ(mom2.size(), static_cast(numel)); std::vector jit_mom1_out(ref_mom1_out.size()); std::vector jit_mom2_out(ref_mom2_out.size()); std::vector jit_param_out(ref_param_out.size()); tgt(beta1, beta2, -lr, eps, numel, grad.data(), mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), jit_mom2_out.data(), jit_param_out.data()); ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); }; TestAllImpls(attr, verifier, beta1, beta2, learning_rate, eps, numel, grad, mom1, mom2, param, mom1_out, mom2_out, param_out); } template void TestKernelAdamW() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const T old_lr = 0.1; const T beta1 = 0.99; const T beta2 = 0.95; const T beta1_pow = beta1 * beta1; const T beta2_pow = beta2 * beta2; const T epsilon = 0.000001; const int64_t numel = 123; const T lr_ratio = 0.2; const T coeff = 0.3; T learning_rate = old_lr * (sqrt(1 - beta2_pow) / (1 - beta1_pow)); T eps = epsilon * sqrt(1 - beta2_pow); std::vector param(numel); std::vector grad(numel); std::vector mom1(numel); std::vector mom2(numel); std::vector param_out(param.size()); std::vector mom1_out(mom1.size()); std::vector mom2_out(mom2.size()); RandomVec(numel, param.data(), 0.5f); RandomVec(numel, grad.data(), 0.5f); RandomVec(numel, mom1.data(), 0.5f); RandomVec(numel, mom2.data(), 0.5f); auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); ref(beta1, beta2, -learning_rate, eps, old_lr, lr_ratio, coeff, numel, grad.data(), mom1.data(), mom2.data(), param.data(), mom1_out.data(), mom2_out.data(), param_out.data()); auto verifier = [](const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, T old_lr, T lr_ratio, T coeff, int64_t numel, const std::vector& grad, const std::vector& mom1, const std::vector& mom2, const std::vector& param, const std::vector& ref_mom1_out, const std::vector& ref_mom2_out, const std::vector& ref_param_out) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(param.size(), static_cast(numel)); EXPECT_EQ(grad.size(), static_cast(numel)); EXPECT_EQ(mom1.size(), static_cast(numel)); EXPECT_EQ(mom2.size(), static_cast(numel)); std::vector jit_mom1_out(ref_mom1_out.size()); std::vector jit_mom2_out(ref_mom2_out.size()); std::vector jit_param_out(ref_param_out.size()); tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(), mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), jit_mom2_out.data(), jit_param_out.data()); ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); }; TestAllImpls(1, verifier, beta1, beta2, learning_rate, eps, old_lr, lr_ratio, coeff, numel, grad, mom1, mom2, param, mom1_out, mom2_out, param_out); } template void TestKernelSgd() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1, phi::errors::InvalidArgument( "The range of Sgd (upper - lower) should be lower " "than n-1 (Sgd size -1). But the upper - lower is %d " "and n-1 is %d.", static_cast(upper - lower), n - 1)); PADDLE_ENFORCE_GT( n, 0, phi::errors::InvalidArgument( "The Sgd size should be larger than 0. But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); } std::random_device rnd; int64_t seed_tmp = rnd(); std::default_random_engine rng(seed_tmp); std::shuffle(all.begin(), all.end(), rng); out.insert(out.begin(), all.begin(), all.begin() + n); return out; }; for (int param_h : {1, 10}) { for (int grad_w : TestSizes()) { std::vector param(param_h * grad_w); std::vector param_out(param_h * grad_w); RandomVec(param_h * grad_w, param.data()); const T* param_data = param.data(); T* out_data = param_out.data(); for (int rows_size = 1; rows_size <= param_h; ++rows_size) { std::vector grad(rows_size * grad_w); std::vector rows = UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); RandomVec(rows_size * grad_w, grad.data()); const int64_t* rows_data = rows.data(); const T* grad_data = grad.data(); auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); ref(&lr, param_data, grad_data, rows_data, out_data, &attr); // inplace test std::vector inp(param.size()); std::copy(param.begin(), param.end(), inp.begin()); T* inp_data = inp.data(); ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr); // only the selected rows should be equal for (int i = 0; i < rows_size; ++i) { ExpectEQ( inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w, grad_w); } auto verifier = [](const typename KernelTuple::func_type tgt, const T lr, const std::vector& param, const std::vector& grad, const std::vector& rows, const std::vector& oref, const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(param.size(), static_cast(attr.param_height * attr.param_width)); EXPECT_EQ(grad.size(), static_cast(attr.grad_height * attr.grad_width)); EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); EXPECT_EQ(param.size(), oref.size()); const T* param_data = param.data(); const T* grad_data = grad.data(); const int64_t* rows_data = rows.data(); const T* oref_data = oref.data(); std::vector out(oref.size()); T* o_data = out.data(); tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); // only the selected rows should be equal for (size_t i = 0; i < rows.size(); ++i) { ExpectEQ(o_data + rows[i] * attr.grad_width, oref_data + rows[i] * attr.grad_width, attr.grad_width); } // inplace std::copy(param.begin(), param.end(), out.begin()); tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); for (size_t i = 0; i < rows.size(); ++i) { ExpectEQ(o_data + rows[i] * attr.grad_width, oref_data + rows[i] * attr.grad_width, attr.grad_width); } }; TestAllImpls( attr, verifier, lr, param, grad, rows, param_out, attr); } } } } template void TestKernelVBroadcast() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int w : TestSizes()) { std::vector x(w); RandomVec(w, x.data()); const T* x_data = x.data(); for (int64_t h : {1, 2, 6}) { auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector y(w * h); T* y_data = y.data(); ref(x_data, y_data, h, w); auto verifier = [](const typename KernelTuple::func_type tgt, const std::vector& x, const std::vector& yref, const int64_t& h, const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(x.size(), static_cast(attr)); EXPECT_EQ(yref.size(), x.size() * h); std::vector y(yref.size()); const T* x_data = x.data(); const T* yref_data = yref.data(); T* y_data = y.data(); tgt(x_data, y_data, h, attr); ExpectEQ(y_data, yref_data, yref.size()); }; TestAllImpls( static_cast(w), verifier, x, y, h, static_cast(w)); } } } // test pool TEST(JITKernel_pool, jitcreator) { const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_EQ(jitcreators.size(), 0UL); #else EXPECT_EQ(jitcreators.size(), 24UL); #endif } TEST(JITKernel_pool, jitpool) { // jitpool is related with attr const auto& kers = jit::JitCodePool().Instance().AllKernels(); EXPECT_EQ(kers.size(), 0UL); jit::GetAllCandidateKernels, CPUPlace>(3); // after call GetAllCandidateKernels, it will create jitcode Automatically #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_EQ(kers.size(), 0UL); #else EXPECT_EQ(kers.size(), 1UL); #endif } TEST(JITKernel_pool, more) { const auto& kers = jit::KernelPool::Instance().AllKernels(); size_t target_num = 7; #ifdef __AVX__ target_num += 2; #endif #ifdef PADDLE_WITH_MKLML target_num += 11; #endif EXPECT_EQ(kers.size(), target_num); } TEST(JITKernel_pool, refer) { const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); EXPECT_EQ(kers.size(), 27UL); } // test helper TEST(JITKernel_helper, GetAllCandidateKernels) { auto fp_kers = jit::GetAllCandidateKernels, CPUPlace>(10); #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_GE(fp_kers.size(), 1UL); // refer #else #ifdef PADDLE_WITH_MKLML EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer #else EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer #endif #endif auto db_kers = jit::GetAllCandidateKernels, CPUPlace>(10); #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_GE(db_kers.size(), 1UL); // refer #else #ifdef PADDLE_WITH_MKLML EXPECT_GE(db_kers.size(), 2UL); // mkl, refer #else EXPECT_GE(db_kers.size(), 1UL); // refer #endif #endif } TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { auto fp_kers = jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); #if defined(__APPLE__) || defined(__OSX__) EXPECT_GE(fp_kers.size(), 1UL); // refer #else #if !defined(PADDLE_WITH_MKLML) || defined(_WIN32) EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer #else EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer #endif #endif auto db_kers = jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); #if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML) EXPECT_GE(db_kers.size(), 1UL); // refer #else EXPECT_GE(db_kers.size(), 2UL); // mkl, refer #endif } TEST(JITKernel_helper, KernelFuncs) { auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; EXPECT_TRUE(f1 != nullptr); EXPECT_TRUE(f1 == f2); auto f3 = jit::KernelFuncs, CPUPlace>::Cache()[5]; #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_TRUE(f2 == f3); #else EXPECT_TRUE(f2 != f3); #endif } TEST(JITKernel_helper, GetAllCandidateFuncs) { auto funcs = jit::GetAllCandidateFuncs, CPUPlace>(10); auto kers = jit::GetAllCandidateKernels, CPUPlace>(10); EXPECT_EQ(funcs.size(), kers.size()); std::vector x(10), tgt(10); RandomVec(10, x.data()); auto best = jit::GetDefaultBestFunc, CPUPlace>(10); best(x.data(), tgt.data(), 10); for (auto f : funcs) { std::vector y(10); f(x.data(), y.data(), 10); ExpectEQ(y.data(), tgt.data(), 10); } } TEST(JITKernel_helper, pack_weights) { const int N = 8 * 60, K = 2; float src[K][N], yref[K][N], y[K * N]; float* x = &(src[0][0]); float* ref = &(yref[0][0]); for (int i = 0; i < N * K; ++i) { *(x + i) = static_cast(i); } int block = 0; std::vector groups; if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) { block = ZMM_FLOAT_BLOCK; groups.push_back(30); } else { block = YMM_FLOAT_BLOCK; groups.insert(groups.end(), {14, 14, 14, 14, 4}); } int offset = 0; int acc = 0; for (int g : groups) { g = g * block; for (int k = 0; k < K; ++k) { for (int i = 0; i < g; ++i) { *(ref + offset) = src[k][i + acc]; offset++; } } acc += g; } jit::pack_weights(x, y, N, K); ExpectEQ(y, ref, N * K); } TEST(JITKernel_helper, attr) { std::ostringstream out; // KernelTypes out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2) << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1) << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul) << jit::to_string(jit::kSeqPool) << jit::to_string(jit::kVAdd) << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu) << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy) << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity) << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu) << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd) << jit::to_string(jit::kAdam) << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare) << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh); EXPECT_EQ(out.str().size(), 208UL); // SeqPoolTypes out.str(""); out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg) << jit::to_string(jit::kSqrt); EXPECT_EQ(out.str().size(), 13UL); EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu); EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity); EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp); EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid); EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh); out.str(""); out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); EXPECT_EQ(out.str().size(), 89UL); out.str(""); out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid); EXPECT_EQ(out.str().size(), 52UL); out.str(""); out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum); EXPECT_EQ(out.str().size(), 44UL); out.str(""); out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg); EXPECT_EQ(out.str().size(), 93UL); out.str(""); out << jit::sgd_attr_t(1, 2, 3, 4, 5); EXPECT_EQ(out.str().size(), 81UL); out.str(""); out << jit::matmul_attr_t(1, 2, 3); EXPECT_EQ(out.str().size(), 14UL); } // test keys TEST(JITKernel_key, int) { EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); EXPECT_TRUE(jit::JitCodeKey(2) != jit::JitCodeKey(3)); } TEST(JITKernel_key, gru) { jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh); jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); auto key4 = jit::JitCodeKey(attr4); auto key5 = jit::JitCodeKey(attr5); EXPECT_TRUE(key1 == key2); EXPECT_TRUE(key2 != key3); EXPECT_TRUE(key2 != key4); EXPECT_TRUE(key2 != key5); EXPECT_TRUE(key3 != key4); EXPECT_TRUE(key3 != key5); EXPECT_TRUE(key4 != key5); } TEST(JITKernel_key, lstm) { jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); auto key4 = jit::JitCodeKey(attr4); auto key5 = jit::JitCodeKey(attr5); auto key6 = jit::JitCodeKey(attr6); EXPECT_TRUE(key1 == key2); EXPECT_TRUE(key2 != key3); EXPECT_TRUE(key2 != key4); EXPECT_TRUE(key2 != key5); EXPECT_TRUE(key3 != key4); EXPECT_TRUE(key3 != key5); EXPECT_TRUE(key4 != key5); EXPECT_TRUE(key5 == key6); } TEST(JITKernel_key, seq_pool) { jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1); jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3); jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3); jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); auto key4 = jit::JitCodeKey(attr4); EXPECT_TRUE(key1 == key2); EXPECT_TRUE(key2 != key3); EXPECT_TRUE(key2 != key4); EXPECT_TRUE(key3 != key4); } TEST(JITKernel_key, matmul) { jit::matmul_attr_t attr1(1, 2, 3); jit::matmul_attr_t attr2(1, 2, 3); jit::matmul_attr_t attr3(1, 3, 3); jit::matmul_attr_t attr4(2, 3, 4); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); auto key4 = jit::JitCodeKey(attr4); EXPECT_TRUE(key1 == key2); EXPECT_TRUE(key2 != key3); EXPECT_TRUE(key2 != key4); EXPECT_TRUE(key3 != key4); } TEST(JITKernel_key, emb_seq_pool) { jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg); jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum); jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); auto key4 = jit::JitCodeKey(attr4); auto key5 = jit::JitCodeKey(attr5); EXPECT_TRUE(key1 == key2); EXPECT_TRUE(key2 == key3); EXPECT_TRUE(key2 != key4); EXPECT_TRUE(key2 != key5); EXPECT_TRUE(key4 != key5); } TEST(JITKernel_key, adam) { jit::adam_attr_t attr1(0.4f, 0.9f); jit::adam_attr_t attr2(0.4f, 0.9f); jit::adam_attr_t attr3(0.1f, 0.3f); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); EXPECT_TRUE(key1 == key2); EXPECT_TRUE(key2 != key3); } TEST(JITKernel_key, sgd) { jit::sgd_attr_t attr1(1, 2, 3, 4, 5); jit::sgd_attr_t attr2(1, 2, 3, 4, 5); jit::sgd_attr_t attr3(9, 8, 7, 4, 6); jit::sgd_attr_t attr4(1, 2, 3, 6, 5); jit::sgd_attr_t attr5(10, 9, 8, 7, 6); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); auto key4 = jit::JitCodeKey(attr4); auto key5 = jit::JitCodeKey(attr5); EXPECT_TRUE(key1 == key2); EXPECT_TRUE(key2 == key3); EXPECT_TRUE(key3 != key4); EXPECT_TRUE(key3 != key5); EXPECT_TRUE(key4 != key5); } // test kernels #define TestKernelVMul TestKernelXYZN #define TestKernelVAdd TestKernelXYZN #define TestKernelVAddRelu TestKernelXYZN #define TestKernelVSub TestKernelXYZN #define TestKernelVScal TestKernelAXYN #define TestKernelVAddBias TestKernelAXYN #define TestKernelVRelu TestKernelXYN #define TestKernelVIdentity TestKernelXYN #define TestKernelVSquare TestKernelXYN #define TestKernelVExp TestKernelXYN #define TestKernelVSigmoid TestKernelXYN #define TestKernelVTanh TestKernelXYN #define TestKernelVCopy TestKernelXYN #define TestKernelLSTMCtHt TestKernelLSTM #define TestKernelLSTMC1H1 TestKernelLSTM #define TestKernelGRUH1 TestKernelGRU #define TestKernelGRUHtPart1 TestKernelGRU #define TestKernelGRUHtPart2 TestKernelGRU #define TEST_CPU_KERNEL(kernel_type) \ TEST(JITKernel, kernel_type) { \ TestKernel##kernel_type, CPUPlace>(); \ TestKernel##kernel_type, CPUPlace>(); \ } TEST_CPU_KERNEL(VMul); TEST_CPU_KERNEL(VAdd); TEST_CPU_KERNEL(VAddRelu); TEST_CPU_KERNEL(VSub); TEST_CPU_KERNEL(VScal); TEST_CPU_KERNEL(VAddBias); TEST_CPU_KERNEL(VRelu); TEST_CPU_KERNEL(VIdentity); TEST_CPU_KERNEL(VSquare); TEST_CPU_KERNEL(VExp); TEST_CPU_KERNEL(VSigmoid); TEST_CPU_KERNEL(VTanh); TEST_CPU_KERNEL(VCopy); TEST_CPU_KERNEL(LSTMCtHt); TEST_CPU_KERNEL(LSTMC1H1); TEST_CPU_KERNEL(GRUH1); TEST_CPU_KERNEL(GRUHtPart1); TEST_CPU_KERNEL(GRUHtPart2); TEST_CPU_KERNEL(LayerNorm); TEST_CPU_KERNEL(CRFDecoding); TEST_CPU_KERNEL(SeqPool); TEST_CPU_KERNEL(EmbSeqPool); TEST_CPU_KERNEL(MatMul); TEST_CPU_KERNEL(Adam); TEST_CPU_KERNEL(AdamW); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast);