From 1b9438079452a3615f4c9f3dac22e083727ea511 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 1 Aug 2022 15:52:38 +0800 Subject: [PATCH] fix(dnn): fix reduce sum/mean error when b is large GitOrigin-RevId: d1bae619b1835ebe7ef7656766700720c3a99d37 --- dnn/src/fallback/reduce/opr_impl.cpp | 99 ++++++---- dnn/src/fallback/reduce/opr_impl.h | 3 + dnn/src/fallback/reduce/reducer.h | 277 ++++++++++++++++++++++++++- dnn/test/fallback/reduce.cpp | 72 +++++++ 4 files changed, 410 insertions(+), 41 deletions(-) diff --git a/dnn/src/fallback/reduce/opr_impl.cpp b/dnn/src/fallback/reduce/opr_impl.cpp index c87bfca0a..5bc78dd63 100644 --- a/dnn/src/fallback/reduce/opr_impl.cpp +++ b/dnn/src/fallback/reduce/opr_impl.cpp @@ -5,7 +5,6 @@ #include "midout.h" #include "reducer.h" -#include "src/common/reduce_helper.h" MIDOUT_DECL(megdnn_fb_reduce_op) MIDOUT_DECL(megdnn_fb_reduce_c) @@ -67,6 +66,27 @@ void reduce_exec(size_t A, size_t B, size_t C, Op op) MEGDNN_NOEXCEPT { namespace megdnn { namespace fallback { +size_t ReduceImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(dst); + + if (src.dtype.enumv() == DTypeEnum::Float32 && + (param().mode == Mode::MEAN || param().mode == Mode::SUM || + param().mode == Mode::SUM_SQR)) { + size_t A, B, C; + reduce::get_ABC(src, A, B, C, param().axis); + if (C == 1) { + // Using B = 247 as an example, you can understand why these parameters exist + size_t _60xT_in_4 = (60 * 3) / 4; // T = 3 + size_t _60xX_in_4 = 4; // 0 < X < T, X = 1,2. + size_t _XXxT_in_4 = 4; + return ((B / _60xT_in_4 + _60xX_in_4 + _XXxT_in_4) * sizeof(float)); + } + } + return naive::ReduceForwardImpl::get_workspace_in_bytes(src, dst); +} + void ReduceImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, workspace.size); @@ -178,45 +198,52 @@ void ReduceImpl::exec_fallback( } bool ReduceImpl::exec_optimized( - _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) { + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { size_t A, B, C; reduce::get_ABC(src.layout, A, B, C, param().axis); bool execed = false; using Mode = param::Reduce::Mode; -#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \ - if (C == 1) { \ - using _Reducer = Reducer; \ - using _ReducerC1SmallB = Reducer; \ - std::function \ - do_reduce = Exec<_Reducer, true>::do_reduce; \ - if (B == 2) \ - do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \ - if (B == 3) \ - do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \ - if (B == 4) \ - do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \ - MIDOUT_BEGIN( \ - megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ - midout_iv(0)) { \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ - reinterpret_cast(src.raw_ptr()), \ - reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ - execed = true; \ - } \ - MIDOUT_END(); \ - } else { \ - using _Reducer = Reducer; \ - std::function \ - do_reduce = Exec<_Reducer, false>::do_reduce; \ - MIDOUT_BEGIN( \ - megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ - midout_iv(1)) { \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ - reinterpret_cast(src.raw_ptr()), \ - reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ - execed = true; \ - } \ - MIDOUT_END(); \ + +#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \ + if (C == 1) { \ + using _Reducer = Reducer; \ + using _ReducerC1SmallB = 
Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, true>::do_reduce; \ + if (B == 2) \ + do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \ + if (B == 3) \ + do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \ + if (B == 4) \ + do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(0)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C, \ + workspace)); \ + execed = true; \ + } \ + MIDOUT_END(); \ + } else { \ + using _Reducer = Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, false>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(1)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C, \ + workspace)); \ + execed = true; \ + } \ + MIDOUT_END(); \ } #define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \ diff --git a/dnn/src/fallback/reduce/opr_impl.h b/dnn/src/fallback/reduce/opr_impl.h index 7271ad486..6ac00d137 100644 --- a/dnn/src/fallback/reduce/opr_impl.h +++ b/dnn/src/fallback/reduce/opr_impl.h @@ -1,4 +1,5 @@ #pragma once +#include "src/common/reduce_helper.h" #include "src/naive/reduce/opr_impl.h" namespace megdnn { @@ -13,6 +14,8 @@ public: _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); void exec_fallback( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); + size_t get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst) override; }; } // namespace fallback diff --git a/dnn/src/fallback/reduce/reducer.h b/dnn/src/fallback/reduce/reducer.h index ced89d3f6..f269314e8 100644 --- a/dnn/src/fallback/reduce/reducer.h +++ b/dnn/src/fallback/reduce/reducer.h @@ -1,5 +1,6 @@ #pragma once +#include "src/common/unroll_macro.h" #include "src/common/utils.h" #include "src/fallback/general_intrinsic/gi_float.h" #include "src/fallback/general_intrinsic/gi_int.h" @@ -395,14 +396,14 @@ template struct Exec { static void do_reduce( const typename Reducer::ctype* src, typename Reducer::ctype* dst, - DType src_dtype, size_t A, size_t B, size_t C); + DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace); }; template struct Exec { static void do_reduce( const typename Reducer::ctype* src, typename Reducer::ctype* dst, - DType src_dtype, size_t A, size_t B, size_t) { + DType src_dtype, size_t A, size_t B, size_t, _megdnn_workspace) { size_t a = 0; for (; a < A; a++) { Reducer reducer0(src_dtype, B); @@ -426,7 +427,7 @@ template struct Exec { static void do_reduce( const typename Reducer::ctype* src, typename Reducer::ctype* dst, - DType src_dtype, size_t A, size_t B, size_t C) { + DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace) { for (size_t a = 0; a < A; a++) { size_t c = 0; for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) { @@ -448,10 +449,276 @@ struct Exec { } }; +// function kern_4x15xT() +// 1. Loop the calculation with SIMD_WIDTH x 15 x T as a set of data +// 2. T affects accuracy, i.e. SIMD_ Width x 15 x T data accumulated into SIMD_ Width +// data 3.D0-d14 is used for reading, then bisection and addition, the addition result +// is stored in D15, and D15 is written once in T cycles + +// function kern_4xXXx1() +// Enter this function when the remaining number is less than 60 +// 1. 
The first switch is to gather the redundant numbers at the end into a vector, +// which can be processed in vector units in subsequent processes +// 2. The second switch loads multiple vectors +// 3. The third switch, binary calculation, results in a vector +#define ImplementC1LargeB(rd_type, coef, case_load, load, for_shift, cal_final_res) \ + template <> \ + struct Exec, true> { \ + using rd_type##Reducer_ = rd_type##Reducer; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + static constexpr int VREG_NUM = 16; \ + static void kern_4x15xT( \ + const float* read_ptr, size_t& read_idx, float* write_ptr, \ + size_t& write_idx, size_t remain_size, size_t T) { \ + GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, \ + d14, d15; \ + constexpr size_t STEP = SIMD_WIDTH * (VREG_NUM - 1); \ + while (read_idx + STEP <= remain_size) { \ + d15 = GiBroadcastFloat32(0.0); \ + size_t i = 0; \ + for (; read_idx + STEP <= remain_size && i < T; \ + read_idx += STEP, i++) { \ + const float* _read_ptr = read_ptr + read_idx; \ + UNROLL_CALL_RAW(15, load, _read_ptr, read_ptr, write_ptr) \ + d0 = GiAddFloat32(d0, d1); \ + d2 = GiAddFloat32(d2, d3); \ + d4 = GiAddFloat32(d4, d5); \ + d6 = GiAddFloat32(d6, d7); \ + d8 = GiAddFloat32(d8, d9); \ + d10 = GiAddFloat32(d10, d11); \ + d12 = GiAddFloat32(d12, d13); \ + d0 = GiAddFloat32(d0, d2); \ + d4 = GiAddFloat32(d4, d6); \ + d8 = GiAddFloat32(d8, d10); \ + d12 = GiAddFloat32(d12, d14); \ + d0 = GiAddFloat32(d0, d4); \ + d8 = GiAddFloat32(d8, d12); \ + d0 = GiAddFloat32(d0, d8); \ + d15 = GiAddFloat32(d0, d15); \ + } \ + GiStoreFloat32(write_ptr + write_idx, d15); \ + write_idx += SIMD_WIDTH; \ + } \ + } \ + static void kern_4xXXx1( \ + const float* read_ptr, size_t& read_idx, float* write_ptr, \ + size_t& write_idx, size_t remain_size) { \ + size_t block_num = remain_size / SIMD_WIDTH; \ + size_t tail_num = remain_size % SIMD_WIDTH; \ + if (block_num == 0) { \ + for_shift(read_ptr, read_idx, write_ptr, write_idx, tail_num); \ + write_idx += tail_num; \ + } else { \ + GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, \ + d13, d14, d15; \ + float buf[4]; \ + switch (tail_num) { \ + case 3: \ + buf[0] = read_ptr[read_idx + remain_size - 1]; \ + buf[1] = read_ptr[read_idx + remain_size - 2]; \ + buf[2] = read_ptr[read_idx + remain_size - 3]; \ + buf[3] = 0; \ + load(0, buf, read_ptr, write_ptr); \ + break; \ + case 2: \ + buf[0] = read_ptr[read_idx + remain_size - 1]; \ + buf[1] = read_ptr[read_idx + remain_size - 2]; \ + buf[2] = 0; \ + buf[3] = 0; \ + load(0, buf, read_ptr, write_ptr); \ + break; \ + case 1: \ + buf[0] = read_ptr[read_idx + remain_size - 1]; \ + buf[1] = 0; \ + buf[2] = 0; \ + buf[3] = 0; \ + load(0, buf, read_ptr, write_ptr); \ + break; \ + default: \ + d0 = GiBroadcastFloat32(0.0); \ + break; \ + } \ + d15 = d0; \ + remain_size -= tail_num; \ + const float* _read_ptr = read_ptr + read_idx; \ + switch (block_num) { \ + case_load(15, _read_ptr, 14, read_ptr, write_ptr); \ + case_load(14, _read_ptr, 13, read_ptr, write_ptr); \ + case_load(13, _read_ptr, 12, read_ptr, write_ptr); \ + case_load(12, _read_ptr, 11, read_ptr, write_ptr); \ + case_load(11, _read_ptr, 10, read_ptr, write_ptr); \ + case_load(10, _read_ptr, 9, read_ptr, write_ptr); \ + case_load(9, _read_ptr, 8, read_ptr, write_ptr); \ + case_load(8, _read_ptr, 7, read_ptr, write_ptr); \ + case_load(7, _read_ptr, 6, read_ptr, write_ptr); \ + case_load(6, _read_ptr, 5, read_ptr, write_ptr); \ + case_load(5, _read_ptr, 4, read_ptr, 
write_ptr); \ + case_load(4, _read_ptr, 3, read_ptr, write_ptr); \ + case_load(3, _read_ptr, 2, read_ptr, write_ptr); \ + case_load(2, _read_ptr, 1, read_ptr, write_ptr); \ + case_load(1, _read_ptr, 0, read_ptr, write_ptr); \ + default: \ + break; \ + } \ + d0 = GiAddFloat32(d0, d15); \ + while (block_num > 1) { \ + switch (block_num) { \ + case 15: \ + case 14: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + d4 = GiAddFloat32(d8, d9); \ + d5 = GiAddFloat32(d10, d11); \ + d6 = GiAddFloat32(d12, d13); \ + if (block_num & 1) \ + d7 = d14; \ + break; \ + case 13: \ + case 12: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + d4 = GiAddFloat32(d8, d9); \ + d5 = GiAddFloat32(d10, d11); \ + if (block_num & 1) \ + d6 = d12; \ + break; \ + case 11: \ + case 10: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + d4 = GiAddFloat32(d8, d9); \ + if (block_num & 1) \ + d5 = d10; \ + break; \ + case 9: \ + case 8: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + if (block_num & 1) \ + d4 = d8; \ + break; \ + case 7: \ + case 6: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + if (block_num & 1) \ + d3 = d6; \ + break; \ + case 5: \ + case 4: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + if (block_num & 1) \ + d2 = d4; \ + break; \ + case 3: \ + case 2: \ + d0 = GiAddFloat32(d0, d1); \ + if (block_num & 1) \ + d1 = d2; \ + default: \ + break; \ + } \ + block_num = (block_num + 1) / 2; \ + } \ + GiStoreFloat32(write_ptr + write_idx, d0); \ + write_idx += SIMD_WIDTH; \ + } \ + } \ + static void do_reduce( \ + const float* src, float* dst, DType src_dtype, size_t A, size_t B, \ + size_t, _megdnn_workspace workspace) { \ + MEGDNN_MARK_USED_VAR(src_dtype); \ + float* workspace_ptr = workspace.raw_ptr->as(); \ + constexpr size_t T = 3; \ + for (size_t a = 0; a < A; a++) { \ + size_t remain_size = B; \ + const float* read_ptr = src + a * B; \ + float* write_ptr = workspace_ptr; \ + while (remain_size > SIMD_WIDTH) { \ + size_t read_idx = 0; \ + size_t write_idx = 0; \ + kern_4x15xT( \ + read_ptr, read_idx, write_ptr, write_idx, remain_size, T); \ + kern_4xXXx1( \ + read_ptr, read_idx, write_ptr, write_idx, \ + remain_size - read_idx); \ + remain_size = write_idx; \ + read_ptr = workspace_ptr; \ + } \ + cal_final_res(remain_size, read_ptr, write_ptr, dst, coef); \ + dst++; \ + } \ + } \ + }; + +#define GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \ + d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT); +#define GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \ + d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT); \ + if (RD_PTR != WR_PTR) \ + d##SHIFT = GiMultiplyFloat32(d##SHIFT, d##SHIFT); + +#define CASE_GI_LOAD(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \ + case NUM: \ + GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \ + MEGDNN_FALLTHRU +#define CASE_GI_LOAD_THEN_MULT(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \ + case NUM: \ + GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \ + MEGDNN_FALLTHRU + +#define FOR_MEAN_AND_SUM(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \ + for (size_t i = 0; i < tail_num; i++) \ + wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i]; +#define FOR_SUM_SQUARE(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \ + if (rd_ptr != wr_ptr) \ + for (size_t i = 0; i < 
tail_num; i++) \ + wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i] * rd_ptr[rd_idx + i]; \ + else \ + for (size_t i = 0; i < tail_num; i++) \ + wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i]; + +#define CAL_FINAL_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \ + float val = 0; \ + if (write_ptr != read_ptr) \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + read_ptr[i]; \ + else \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + write_ptr[i]; \ + *dst_ptr = val * coef; +#define CAL_FINAL_SQUARE_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \ + float val = 0; \ + if (write_ptr != read_ptr) \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + read_ptr[i] * read_ptr[i]; \ + else \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + write_ptr[i]; \ + *dst_ptr = val * coef; + +ImplementC1LargeB( + Mean, 1 / B, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT); +ImplementC1LargeB(Sum, 1, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT); +ImplementC1LargeB( + SumSqr, 1, CASE_GI_LOAD_THEN_MULT, GI_LOAD_THEN_MULT, FOR_SUM_SQUARE, + CAL_FINAL_SQUARE_RESULT); + template struct ExecC1SmallB { static void do_reduce( - const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C); + const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C, + _megdnn_workspace); }; #define ImplementC1SmallB(_ctype, _gi_type, _gi_ins) \ @@ -459,7 +726,7 @@ struct ExecC1SmallB { struct ExecC1SmallB { \ static void do_reduce( \ const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \ - size_t) { \ + size_t, _megdnn_workspace) { \ size_t a = 0; \ for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \ Reducer reducer(src_dtype, B); \ diff --git a/dnn/test/fallback/reduce.cpp b/dnn/test/fallback/reduce.cpp index 02aeb9640..d452e5dea 100644 --- a/dnn/test/fallback/reduce.cpp +++ b/dnn/test/fallback/reduce.cpp @@ -352,6 +352,78 @@ TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) { }; run(); } + +TEST_F(FALLBACK, BENCHMARK_REDUCE) { + auto run = [&]() { + Benchmarker benchmarker_reduce(handle()); + benchmarker_reduce.set_display(false); + using Mode = param::Reduce::Mode; + + constexpr size_t RUNS = 100; + benchmarker_reduce.set_times(RUNS); + + TensorShape small{3 * 224 * 224}; + TensorShape large{3 * 224 * 224 * 100}; + param::Reduce param; + param.axis = 0; + + for (auto i = 224; i < 224 * 2; i++) { + for (auto mode : {Mode::SUM, Mode::MEAN, Mode::SUM_SQR}) { + param.mode = mode; + benchmarker_reduce.set_param(param); + auto reduce = benchmarker_reduce.execs({{3 * 224 * i}, {}}) / RUNS; + } + } + param.mode = param::Reduce::Mode::SUM; + benchmarker_reduce.set_param(param); + printf("SUM\n"); + { + TensorLayout src(small, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 1: reduce use time %fms\n", reduce); + } + { + TensorLayout src(large, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 1: reduce use time %fms\n", reduce); + } + + param.mode = param::Reduce::Mode::MEAN; + benchmarker_reduce.set_param(param); + printf("MEAN\n"); + { + TensorLayout src(small, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 2: reduce use time %fms\n", reduce); + } + { + TensorLayout src(large, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 2: reduce use time %fms\n", reduce); + } + + param.mode = param::Reduce::Mode::SUM_SQR; + 
benchmarker_reduce.set_param(param); + printf("SUM_SQR\n"); + { + TensorLayout src(small, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 3: reduce use time %fms\n", reduce); + } + { + TensorLayout src(large, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 3: reduce use time %fms\n", reduce); + } + }; + run(); +} #endif // vim: syntax=cpp.doxygen -- GitLab
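Why the error shows up for large B: on the C == 1 path the column was previously fed through what amounts to a few running float32 accumulators, so once the running sum grows much larger than the individual addends each addition loses low-order bits. The new kern_4x15xT / kern_4xXXx1 path instead collapses the input pass by pass into partial sums of similar magnitude. The following standalone scalar sketch (plain C++, not the GI kernels; the block size 180 mirrors SIMD_WIDTH * 15 * T with SIMD_WIDTH = 4 and T = 3, and the 0.1f fill value is only for illustration) shows the difference on the large shape used by the new benchmark, 3 * 224 * 224 * 100:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const size_t B = 3 * 224 * 224 * 100;  // "large" shape from the new benchmark
    std::vector<float> src(B, 0.1f);

    // Single running float32 accumulator: once it is large, each 0.1f addend is
    // rounded to a multiple of the accumulator's ulp and the total drifts badly.
    float naive = 0.f;
    for (float v : src)
        naive += v;

    // Pass-by-pass blocked reduction, as do_reduce does with the workspace buffer:
    // every addition combines values of similar magnitude.
    const size_t BLOCK = 60 * 3;  // mirrors SIMD_WIDTH * 15 * T
    std::vector<float> cur;
    const float* read = src.data();
    size_t n = src.size();
    do {
        std::vector<float> next;
        next.reserve(n / BLOCK + 1);
        for (size_t i = 0; i < n; i += BLOCK) {
            float s = 0.f;
            for (size_t j = i; j < n && j < i + BLOCK; ++j)
                s += read[j];
            next.push_back(s);
        }
        cur.swap(next);
        read = cur.data();
        n = cur.size();
    } while (n > 1);

    std::printf("reference: %.1f\n", 0.1 * static_cast<double>(B));
    std::printf("naive    : %.1f\n", static_cast<double>(naive));   // large error
    std::printf("blocked  : %.1f\n", static_cast<double>(cur[0]));  // close to reference
    return 0;
}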
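The workspace returned by the new ReduceImpl::get_workspace_in_bytes only has to hold the partial sums written by the first pass; later passes re-reduce the buffer in place and always shrink it. Under the assumption that SIMD_WIDTH == 4 (GI_SIMD_LEN_BYTE == 16) and T == 3, kern_4x15xT writes one 4-float vector per group of at most 60 * T = 180 inputs plus one vector for a final partial group (the _60xX_in_4 term), and kern_4xXXx1 writes at most one more vector for the sub-60 tail (the _XXxT_in_4 term), which is where B / 45 + 4 + 4 floats comes from. A small standalone check of that bound, including the B = 247 example mentioned in the comment (names here are illustrative, not MegDNN code):

#include <cstddef>
#include <cstdio>

int main() {
    constexpr size_t SIMD_WIDTH = 4;          // assumes GI_SIMD_LEN_BYTE == 16
    constexpr size_t T = 3;
    constexpr size_t STEP = SIMD_WIDTH * 15;  // 60 inputs per kern_4x15xT iteration

    for (size_t B = SIMD_WIDTH + 1; B <= 20000000; ++B) {
        // kern_4x15xT: one SIMD_WIDTH-float partial sum per group of up to STEP * T inputs.
        size_t steps = B / STEP;
        size_t groups = (steps + T - 1) / T;
        // kern_4xXXx1: at most one more vector (or a copied sub-vector tail).
        size_t tail = B - steps * STEP;
        size_t tail_out = tail >= SIMD_WIDTH ? SIMD_WIDTH : tail;
        size_t first_pass_out = groups * SIMD_WIDTH + tail_out;

        // Bound used by get_workspace_in_bytes: B / 45 + 4 + 4 floats.
        size_t bound = B / ((STEP * T) / SIMD_WIDTH) + SIMD_WIDTH + SIMD_WIDTH;
        if (first_pass_out > bound) {
            std::printf("bound too small for B=%zu: need %zu, have %zu\n", B,
                        first_pass_out, bound);
            return 1;
        }
        if (B == 247)
            std::printf("B=247: first pass writes %zu floats, bound allows %zu\n",
                        first_pass_out, bound);
    }
    std::printf("bound covers the first pass for every tested B\n");
    return 0;
}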
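The comment above kern_4xXXx1 describes three switch statements: gather the ragged end (size % 4 elements) into one zero-padded vector, load the remaining full vectors, then halve the vector count by pairwise addition until a single vector is left. A simplified plain-array model of that flow for the SUM/MEAN case (Lane4, add and reduce_tail are illustrative names; the squaring that the SUM_SQR variant applies on load is omitted):

#include <array>
#include <cstddef>
#include <cstdio>
#include <vector>

using Lane4 = std::array<float, 4>;  // stands in for one GI_FLOAT32_t register

static Lane4 add(const Lane4& a, const Lane4& b) {
    return {{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]}};
}

// Reduces n (< 60) floats to one 4-lane partial sum, following kern_4xXXx1's steps.
static Lane4 reduce_tail(const float* src, size_t n) {
    size_t blocks = n / 4, tail = n % 4;
    Lane4 padded = {{0.f, 0.f, 0.f, 0.f}};
    for (size_t i = 0; i < tail; ++i)      // switch 1: gather the ragged end, zero-padded
        padded[i] = src[n - 1 - i];
    std::vector<Lane4> regs;
    for (size_t b = 0; b < blocks; ++b) {  // switch 2: load the full 4-lane blocks
        Lane4 r = {{src[4 * b], src[4 * b + 1], src[4 * b + 2], src[4 * b + 3]}};
        regs.push_back(r);
    }
    regs.push_back(padded);
    while (regs.size() > 1) {              // switch 3: pairwise addition down to one register
        std::vector<Lane4> next;
        for (size_t i = 0; i + 1 < regs.size(); i += 2)
            next.push_back(add(regs[i], regs[i + 1]));
        if (regs.size() & 1)
            next.push_back(regs.back());
        regs.swap(next);
    }
    return regs[0];
}

int main() {
    float data[59];
    for (int i = 0; i < 59; ++i)
        data[i] = 1.f;
    Lane4 r = reduce_tail(data, 59);
    std::printf("lanes: %g %g %g %g, total %g (expect 59)\n", r[0], r[1], r[2], r[3],
                r[0] + r[1] + r[2] + r[3]);
    return 0;
}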