提交 1b943807 编写于 作者: M Megvii Engine Team

fix(dnn): fix reduce sum/mean error when B (the size of the reduced axis) is large

GitOrigin-RevId: d1bae619b1835ebe7ef7656766700720c3a99d37
上级 c7a99098
...@@ -5,7 +5,6 @@ ...@@ -5,7 +5,6 @@
#include "midout.h" #include "midout.h"
#include "reducer.h" #include "reducer.h"
#include "src/common/reduce_helper.h"
MIDOUT_DECL(megdnn_fb_reduce_op) MIDOUT_DECL(megdnn_fb_reduce_op)
MIDOUT_DECL(megdnn_fb_reduce_c) MIDOUT_DECL(megdnn_fb_reduce_c)
...@@ -67,6 +66,27 @@ void reduce_exec(size_t A, size_t B, size_t C, Op op) MEGDNN_NOEXCEPT { ...@@ -67,6 +66,27 @@ void reduce_exec(size_t A, size_t B, size_t C, Op op) MEGDNN_NOEXCEPT {
namespace megdnn { namespace megdnn {
namespace fallback { namespace fallback {
size_t ReduceImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& dst) {
    // Workspace for the optimized fp32 SUM / MEAN / SUM_SQR path: when the
    // reduction collapses to C == 1, the kernel accumulates block-wise
    // partial sums in scratch memory so a very large B does not accumulate
    // error in a single running sum.
    // NOTE: the redundant MEGDNN_MARK_USED_VAR(src/dst) calls were removed —
    // both parameters are used unconditionally below.
    if (src.dtype.enumv() == DTypeEnum::Float32 &&
        (param().mode == Mode::MEAN || param().mode == Mode::SUM ||
         param().mode == Mode::SUM_SQR)) {
        size_t A, B, C;
        reduce::get_ABC(src, A, B, C, param().axis);
        if (C == 1) {
            // Magic numbers mirror the kernel's tiling (work through
            // B = 247 as an example). Presumably the kernel consumes
            // 60 * T elements per vectorized step with T = 3, storing one
            // partial per step plus slack for the remainder blocks —
            // TODO(review): confirm against the kernel implementation.
            size_t _60xT_in_4 = (60 * 3) / 4;  // T = 3
            size_t _60xX_in_4 = 4;             // 0 < X < T, X = 1,2.
            size_t _XXxT_in_4 = 4;
            // One float of scratch per full block, plus tail slack.
            return ((B / _60xT_in_4 + _60xX_in_4 + _XXxT_in_4) * sizeof(float));
        }
    }
    // Every other dtype/mode/layout: defer to the naive implementation.
    return naive::ReduceForwardImpl::get_workspace_in_bytes(src, dst);
}
void ReduceImpl::exec( void ReduceImpl::exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size); check_exec(src.layout, dst.layout, workspace.size);
...@@ -178,45 +198,52 @@ void ReduceImpl::exec_fallback( ...@@ -178,45 +198,52 @@ void ReduceImpl::exec_fallback(
} }
bool ReduceImpl::exec_optimized( bool ReduceImpl::exec_optimized(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) { _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
size_t A, B, C; size_t A, B, C;
reduce::get_ABC(src.layout, A, B, C, param().axis); reduce::get_ABC(src.layout, A, B, C, param().axis);
bool execed = false; bool execed = false;
using Mode = param::Reduce::Mode; using Mode = param::Reduce::Mode;
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \ #define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \ if (C == 1) { \
using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \ using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \ using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \
do_reduce = Exec<_Reducer, true>::do_reduce; \ std::function<void( \
if (B == 2) \ const ctype*, ctype*, DType, size_t, size_t, size_t, \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \ _megdnn_workspace)> \
if (B == 3) \ do_reduce = Exec<_Reducer, true>::do_reduce; \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \ if (B == 2) \
if (B == 4) \ do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \ if (B == 3) \
MIDOUT_BEGIN( \ do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ if (B == 4) \
midout_iv(0)) { \ do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ MIDOUT_BEGIN( \
reinterpret_cast<ctype*>(src.raw_ptr()), \ megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \ midout_iv(0)) { \
execed = true; \ MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
} \ reinterpret_cast<ctype*>(src.raw_ptr()), \
MIDOUT_END(); \ reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C, \
} else { \ workspace)); \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \ execed = true; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \ } \
do_reduce = Exec<_Reducer, false>::do_reduce; \ MIDOUT_END(); \
MIDOUT_BEGIN( \ } else { \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
midout_iv(1)) { \ std::function<void( \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ const ctype*, ctype*, DType, size_t, size_t, size_t, \
reinterpret_cast<ctype*>(src.raw_ptr()), \ _megdnn_workspace)> \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \ do_reduce = Exec<_Reducer, false>::do_reduce; \
execed = true; \ MIDOUT_BEGIN( \
} \ megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
MIDOUT_END(); \ midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C, \
workspace)); \
execed = true; \
} \
MIDOUT_END(); \
} }
#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \ #define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \
......
#pragma once #pragma once
#include "src/common/reduce_helper.h"
#include "src/naive/reduce/opr_impl.h" #include "src/naive/reduce/opr_impl.h"
namespace megdnn { namespace megdnn {
...@@ -13,6 +14,8 @@ public: ...@@ -13,6 +14,8 @@ public:
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
void exec_fallback( void exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) override;
}; };
} // namespace fallback } // namespace fallback
......
此差异已折叠。
...@@ -352,6 +352,78 @@ TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) { ...@@ -352,6 +352,78 @@ TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) {
}; };
run(); run();
} }
TEST_F(FALLBACK, BENCHMARK_REDUCE) {
    auto run = [&]() {
        Benchmarker<Reduce> benchmarker_reduce(handle());
        benchmarker_reduce.set_display(false);
        using Mode = param::Reduce::Mode;
        constexpr size_t RUNS = 100;
        benchmarker_reduce.set_times(RUNS);

        // One small and one large 1-D reduction; the large shape exercises
        // the big-B code path this benchmark was added alongside.
        TensorShape small{3 * 224 * 224};
        TensorShape large{3 * 224 * 224 * 100};

        param::Reduce param;
        param.axis = 0;

        // Warm-up sweep over many intermediate sizes and all three modes so
        // the timed runs below are stable. Timing result intentionally
        // discarded here (was previously assigned to an unused local).
        for (auto i = 224; i < 224 * 2; i++) {
            for (auto mode : {Mode::SUM, Mode::MEAN, Mode::SUM_SQR}) {
                param.mode = mode;
                benchmarker_reduce.set_param(param);
                benchmarker_reduce.execs({{3 * 224 * i}, {}});
            }
        }

        // Time a single fp32 shape with the currently-set param and report
        // the per-iteration cost. `tag` distinguishes small vs large (the
        // old output printed the same "case N" label for both).
        auto bench_one = [&](const char* tag, const TensorShape& shape) {
            TensorLayout src(shape, dtype::Float32());
            auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
            printf("%s: reduce use time %fms\n", tag, reduce);
        };

        const struct {
            Mode mode;
            const char* name;
        } cases[] = {
                {Mode::SUM, "SUM"},
                {Mode::MEAN, "MEAN"},
                {Mode::SUM_SQR, "SUM_SQR"},
        };
        for (const auto& c : cases) {
            param.mode = c.mode;
            benchmarker_reduce.set_param(param);
            printf("%s\n", c.name);
            bench_one("small", small);
            bench_one("large", large);
        }
    };
    run();
}
#endif #endif
// vim: syntax=cpp.doxygen // vim: syntax=cpp.doxygen
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册