From 81177258522c11340c8b91a1bbcd4de1479786df Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 Jan 2019 08:43:20 +0000 Subject: [PATCH] add jit kernel hsum, hmax and softmax refer code test=develop --- paddle/fluid/operators/jit/benchmark.cc | 101 ++++---- paddle/fluid/operators/jit/helper.cc | 3 + paddle/fluid/operators/jit/kernel_base.h | 15 ++ .../fluid/operators/jit/refer/CMakeLists.txt | 3 + paddle/fluid/operators/jit/refer/refer.cc | 5 + paddle/fluid/operators/jit/refer/refer.h | 39 +++ paddle/fluid/operators/jit/test.cc | 222 +++++++++++------- paddle/fluid/platform/dynload/mklml.h | 2 + 8 files changed, 269 insertions(+), 121 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 186c37c56ec..383532d8d22 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { using Tensor = paddle::framework::Tensor; -template +template void BenchXYZNKernel() { for (int d : TestSizes()) { Tensor x, y, z; @@ -175,7 +175,7 @@ void BenchXYZNKernel() { } } -template +template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); @@ -190,7 +190,17 @@ void BenchAXYNKernel() { } } -template +template +void BenchXRNKernel() { + for (int d : TestSizes()) { + Tensor x; + RandomVec(d, x.mutable_data({d}, PlaceType())); + T res; + BenchAllImpls, PlaceType>(d, x.data(), &res, d); + } +} + +template void BenchXYNKernel() { for (int d : TestSizes()) { Tensor x, y; @@ -203,7 +213,7 @@ void BenchXYNKernel() { } } -template +template void BenchLSTMKernel() { for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { @@ -240,7 +250,7 @@ void BenchLSTMKernel() { } } -template +template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); @@ -262,7 +272,7 @@ void BenchGRUKernel() { } } 
-template +template void BenchSeqPoolKernel() { std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; @@ -284,7 +294,7 @@ void BenchSeqPoolKernel() { } } -template +template void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { @@ -305,57 +315,64 @@ void BenchMatMulKernel() { } } +template +void BenchSoftmaxKernel() { + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + Tensor x, y; + x.Resize({bs, n}); + y.Resize({bs, n}); + RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); + BenchAllImpls, PlaceType>(n, x_data, y_data, n, + bs); + } + } +} + using T = float; -using PlaceType = paddle::platform::CPUPlace; +using CPUPlace = paddle::platform::CPUPlace; // xyzn -BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } // axyn -BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } -BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } +// xrn +BENCH_FP32_CPU(kHSum) { BenchXRNKernel(); } +BENCH_FP32_CPU(kHMax) { BenchXRNKernel(); } // xyn -BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVExp) { 
BenchXYNKernel(); } +BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } // lstm and peephole -BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } - -BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } // gru functions -BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } - -BENCH_FP32_CPU(kGRUHtPart1) { - BenchGRUKernel(); -} - -BENCH_FP32_CPU(kGRUHtPart2) { - BenchGRUKernel(); -} +BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel(); } // seq pool function -BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } +BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } // matmul -BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } +BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } + +// softmax +BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel(); } // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] 
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 5dbe22a81b4..4dac2f2460f 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -49,6 +49,9 @@ const char* to_string(KernelType kt) { ONE_CASE(kNCHW16CMulNC); ONE_CASE(kSeqPool); ONE_CASE(kMatMul); + ONE_CASE(kHMax); + ONE_CASE(kHSum); + ONE_CASE(kSoftmax); default: PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index adb101bd5cd..42a58580f7b 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -20,6 +20,7 @@ namespace paddle { namespace operators { namespace jit { +// TODO(TJ): reorder by alphabet typedef enum { kNone = 0, kVMul = 1, @@ -44,6 +45,9 @@ typedef enum { kNCHW16CMulNC, kSeqPool, kMatMul, + kHSum, // horizontal sum + kHMax, // horizontal max + kSoftmax, } KernelType; typedef enum { @@ -70,6 +74,10 @@ struct XYNTuples { typedef void (*func_type)(const T*, T*, int); }; +// x, return and int +template +struct XRNTuples : public XYNTuples {}; + typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh const void* ct_1; @@ -159,6 +167,13 @@ struct LayerNormTuples { const float, int); }; +template +struct SoftmaxTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int, int); +}; + // nChw16c = nChw16c .* NC template struct NCHW16CMulNCTuples { diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 4b9bc5e8d49..9f2935828ca 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC) USE_JITKERNEL_REFER(kSeqPool) USE_JITKERNEL_REFER(kMatMul) USE_JITKERNEL_REFER(kVSquare) +USE_JITKERNEL_REFER(kHSum) +USE_JITKERNEL_REFER(kHMax) 
+USE_JITKERNEL_REFER(kSoftmax) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 3512ad7fe79..b8adb40ec7e 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool); REGISTER_REFER_KERNEL(kMatMul, MatMul); +REGISTER_REFER_KERNEL(kHMax, HMax); +REGISTER_REFER_KERNEL(kHSum, HSum); + +REGISTER_REFER_KERNEL(kSoftmax, Softmax); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 97d02935859..5a074db7e0e 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -378,6 +378,40 @@ void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { } } +template +void HMax(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] = res[0] < x[i] ? x[i] : res[0]; + } +} + +template +void HSum(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] += x[i]; + } +} + +// y = e^(x - max(x)) +// y = y / sum(y) +template +void Softmax(const T* x, T* y, int n, int bs = 1) { + for (int i = 0; i < bs; ++i) { + T scalar; + HMax(x, &scalar, n); + scalar = static_cast(0) - scalar; + VAddBias(&scalar, x, y, n); // x - max + VExp(y, y, n); + HSum(y, &scalar, n); + scalar = static_cast(1) / scalar; + VScal(&scalar, y, y, n); + x += n; + y += n; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -421,6 +455,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); DECLARE_REFER_KERNEL(MatMul, MatMulTuples); +DECLARE_REFER_KERNEL(HMax, XRNTuples); +DECLARE_REFER_KERNEL(HSum, XRNTuples); + +DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 68a79b6314e..2578b282ab0 
100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -61,6 +61,7 @@ std::vector TestSizes() { } namespace jit = paddle::operators::jit; +using CPUPlace = paddle::platform::CPUPlace; template struct TestFuncWithRefer { @@ -121,6 +122,40 @@ struct TestFuncWithRefer, T, std::vector, } }; +template +struct TestFuncWithRefer, std::vector, std::vector, + int, int> { + void operator()(const typename jit::SoftmaxTuples::func_type tgt, + const std::vector& x, const std::vector& yref, int n, + int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + } +}; + +template +struct TestFuncWithRefer, std::vector, T> { + void operator()(const typename jit::XRNTuples::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + } +}; + template struct TestFuncWithRefer, std::vector, std::vector> { void operator()(const typename jit::XYNTuples::func_type tgt, @@ -172,7 +207,7 @@ struct TestFuncWithRefer, std::vector, std::vector, T* ht_data = ht.data(); T* checked_data = checked.data(); - paddle::operators::jit::lstm_t step; + jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; step.ct = ct_data; @@ -208,7 +243,7 @@ struct TestFuncWithRefer, std::vector, std::vector, const T* ht_ref_data = ht_ref.data(); T* x_data = x.data(); T* ht_data = ht.data(); - paddle::operators::jit::gru_t step; + jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; @@ -255,8 +290,8 @@ struct 
TestFuncWithRefer, std::vector, std::vector, } }; -template +template void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { TestFuncWithRefer test; // test jitcode @@ -286,9 +321,8 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { test(tgt, args...); } -template +template void TestXYZNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -320,9 +354,8 @@ void TestXYZNKernel() { } } -template +template void TestAXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -347,9 +380,23 @@ void TestAXYNKernel() { } } -template +template +void TestXRNKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data()); + T ref_res; + ref(x.data(), &ref_res, d); + TestAllImpls, PlaceType, std::vector, T>(d, x, + ref_res); + } +} + +template void TestXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -373,9 +420,8 @@ void TestXYNKernel() { } } -template +template void TestLSTMKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -424,9 +470,8 @@ void TestLSTMKernel() { } } -template +template void TestGRUKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -459,7 +504,7 @@ void TestGRUKernel() { } } -template +template void 
TestSeqPoolKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector pool_types = { @@ -484,7 +529,7 @@ void TestSeqPoolKernel() { } } -template +template void TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); auto last_acc = acc; @@ -510,7 +555,32 @@ void TestMatMulKernel() { acc = last_acc; } -template +template +void TestSoftmaxKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(bs * n), y(bs * n); + RandomVec(bs * n, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.data(); + + std::vector xinp(x.size()); // inplace test + std::copy(x.begin(), x.end(), xinp.begin()); + ref(x_data, y_data, n, bs); + T* xinp_data = xinp.data(); + ref(xinp_data, xinp_data, n, bs); + ExpectEQ(xinp_data, y_data, n * bs); + + TestAllImpls, PlaceType, std::vector, + std::vector>(n, x, y, n, bs); + } + } +} + +template void TestNCHW16CMulNCKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); const int n = 3, c = 16 * 4, h = 10, w = 10; @@ -565,129 +635,123 @@ void TestNCHW16CMulNCKernel() { // XYZNTuple TEST(JITKernel, kVMul) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVAdd) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVAddRelu) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVSub) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } // AXYNTuples TEST(JITKernel, kVScal) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + 
TestAXYNKernel(); } TEST(JITKernel, kVAddBias) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + TestAXYNKernel(); +} + +// XRNTuples +TEST(JITKernel, kHMax) { + TestXRNKernel(); + TestXRNKernel(); +} + +TEST(JITKernel, kHSum) { + TestXRNKernel(); + TestXRNKernel(); } // XYNTuples TEST(JITKernel, kVRelu) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVIdentity) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSquare) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVExp) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSigmoid) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVTanh) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } // LSTM TEST(JITKernel, kLSTMCtHt) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + TestLSTMKernel(); } TEST(JITKernel, kLSTMC1H1) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + TestLSTMKernel(); } // GRU TEST(JITKernel, kGRUH1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart2) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } 
TEST(JITKernel, kSeqPool) { - namespace jit = paddle::operators::jit; - TestSeqPoolKernel(); - TestSeqPoolKernel(); + TestSeqPoolKernel(); + TestSeqPoolKernel(); } TEST(JITKernel, kMatMul) { - namespace jit = paddle::operators::jit; - TestMatMulKernel(); - TestMatMulKernel(); + TestMatMulKernel(); + TestMatMulKernel(); +} + +TEST(JITKernel, kSoftmax) { + TestSoftmaxKernel(); + TestSoftmaxKernel(); } TEST(JITKernel, kNCHW16CMulNC) { - namespace jit = paddle::operators::jit; - TestNCHW16CMulNCKernel(); - TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); } // TODO(yihua/TJ): add crf decoding and layer norm unit tests diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index d0619293acf..a260cda4913 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -70,6 +70,8 @@ extern void* mklml_dso_handle; __macro(cblas_ddot); \ __macro(cblas_sasum); \ __macro(cblas_dasum); \ + __macro(cblas_isamax); \ + __macro(cblas_idamax); \ __macro(cblas_sscal); \ __macro(cblas_dscal); \ __macro(vsAdd); \ -- GitLab