diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index b39ce280939515ec8f4fa3b443ff4332074825fd..74d6a87247821eb1d17cc97b8d8b4bcf1c832f79 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -22,10 +22,54 @@ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/variant.h" // for UNUSED DEFINE_int32(burning, 10, "Burning times."); DEFINE_int32(repeat, 3000, "Repeat times."); DEFINE_int32(max_size, 1000, "The Max size would be tested."); +DEFINE_string(filter, "", "The Benchmark name would be run."); + +class BenchJITKernel { + public: + BenchJITKernel() = default; + virtual ~BenchJITKernel() = default; + virtual void Run() = 0; + virtual const char* Name() = 0; + virtual const char* Dtype() = 0; + virtual const char* Place() = 0; +}; + +static std::vector g_all_benchmarks; + +BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { + g_all_benchmarks.push_back(b); + return b; +} + +#define BENCH_JITKERNEL(name, dtype, place) \ + class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \ + public: \ + const char* Name() override { return #name; } \ + const char* Dtype() override { return #dtype; } \ + const char* Place() override { return #place; } \ + void Run() override; \ + }; \ + static auto inserted_##name##_##dtype##_##place##_ UNUSED = \ + InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ + void BenchJITKernel_##name##_##dtype##_##place##_::Run() + +#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU) + +void RUN_ALL_BENCHMARK() { + for (auto p : g_all_benchmarks) { + if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { + continue; + } + LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "." + << p->Place(); + p->Run(); + } +} template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), @@ -228,49 +272,70 @@ void BenchMatMulKernel() { } } +using T = float; +using PlaceType = paddle::platform::CPUPlace; + +// xyzn +BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } + +BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } + +BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } + +BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } + +// axyn +BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } + +BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } + +// xyn +BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } + +// lstm and peephole +BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } + +BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } + +// gru functions +BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } + +BENCH_FP32_CPU(kGRUHtPart1) { + BenchGRUKernel(); +} + +BENCH_FP32_CPU(kGRUHtPart2) { + BenchGRUKernel(); +} + +// seq pool function +BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } + +// matmul +BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: // --burning: the burning time before count // --repeat: the repeat times // --max_size: the max size would be tested +// --filter: the bench name would be run int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat << " times."; - using T = float; - using PlaceType = paddle::platform::CPUPlace; - // xyzn - BenchXYZNKernel(); - BenchXYZNKernel(); - BenchXYZNKernel(); - BenchXYZNKernel(); - - // axyn - BenchAXYNKernel(); - BenchAXYNKernel(); - - // xyn - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - - // lstm and peephole - BenchLSTMKernel(); - BenchLSTMKernel(); - - // gru functions - BenchGRUKernel(); - BenchGRUKernel(); - BenchGRUKernel(); - - // seq pool function - BenchSeqPoolKernel(); - // matmul - BenchMatMulKernel(); + RUN_ALL_BENCHMARK(); } diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index f4415a54ca9678c75038a820bb5d212e61593ec7..68a79b6314e4cf86f5b715b9c6694924126b12da 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -22,6 +22,8 @@ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" +static double acc = 1e-5; + template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), const T upper = static_cast(20.f)) { @@ -37,7 +39,7 @@ template void ExpectEQ(const T* target, const T* refer, int n) { if (std::is_floating_point::value) { for (int i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], 1e-5); + EXPECT_NEAR(target[i], refer[i], acc); } } else { for (int i = 0; i < n; ++i) { @@ -62,7 +64,9 @@ namespace jit = paddle::operators::jit; template struct TestFuncWithRefer { - void operator()(const typename KernelTuples::func_type tgt, Args... args) {} + void operator()(const typename KernelTuples::func_type tgt, Args... args) { + LOG(FATAL) << "Should specify this function."; + } }; template @@ -140,7 +144,8 @@ struct TestFuncWithRefer, std::vector, std::vector> { template struct TestFuncWithRefer, std::vector, std::vector, - std::vector, std::vector, std::vector> { + std::vector, std::vector, std::vector, + typename jit::LSTMTuples::attr_type> { void operator()(const typename jit::LSTMTuples::func_type tgt, const std::vector& xsrc, const std::vector& wp, const std::vector& ct_1, const std::vector& ct_ref, @@ -185,7 +190,8 @@ struct TestFuncWithRefer, std::vector, std::vector, template struct TestFuncWithRefer, std::vector, std::vector, - std::vector> { + std::vector, + typename jit::GRUTuples::attr_type> { void operator()(const typename jit::GRUTuples::func_type tgt, const std::vector& xsrc, const std::vector& ht_1, const std::vector& ht_ref, @@ -212,8 +218,8 @@ struct TestFuncWithRefer, std::vector, std::vector, }; template -struct TestFuncWithRefer, std::vector, - std::vector> { +struct TestFuncWithRefer, std::vector, std::vector, + typename jit::SeqPoolTuples::attr_type> { void operator()(const typename jit::SeqPoolTuples::func_type tgt, const std::vector& x, const std::vector& yref, const typename jit::SeqPoolTuples::attr_type& attr) { @@ -385,8 +391,8 @@ void TestLSTMKernel() { std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); RandomVec(4 * d, xsrc.data(), -2.f, 2.f); - RandomVec(3 * d, wp.data(), -2.f, 2.f); - RandomVec(d, ct_1.data(), -2.f, 2.f); + RandomVec(3 * d, wp.data(), -1.f, 1.f); + RandomVec(d, ct_1.data(), -1.f, 1.f); // x could be changed after compute, so copy to save src std::vector x(xsrc.size()); std::copy(xsrc.begin(), xsrc.end(), x.begin()); @@ -481,14 +487,17 @@ void TestSeqPoolKernel() { template void TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + auto last_acc = acc; + // TODO(intel): this should be acc issue of MKL + acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -0.2f, 0.2f); - RandomVec(k * n, b.data(), -0.2f, 0.2f); + RandomVec(m * k, a.data(), -2.f, 2.f); + RandomVec(k * n, b.data(), -2.f, 2.f); const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); @@ -498,6 +507,7 @@ void TestMatMulKernel() { } } } + acc = last_acc; } template