Commit 1b943807 authored by Megvii Engine Team

fix(dnn): fix reduce sum/mean error when b is large

GitOrigin-RevId: d1bae619b1835ebe7ef7656766700720c3a99d37
Parent c7a99098
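Why the fix needs a workspace: when B is very large, accumulating a float32 SUM/MEAN in a single running register loses precision, because each new element is rounded against an ever-growing partial sum. The sketch below is illustrative only and is not the MegDNN kernel: it shows the general blocked-accumulation idea, where fixed-size blocks are summed into partial results stored in a small workspace and the partial results are reduced afterwards. The block size of 180 elements is an assumption that mirrors the 60 * T (T = 3) factor in the workspace formula added below; the real kernel is vectorized and dispatched per dtype/mode.

#include <cstddef>

// Illustrative blocked float32 summation with an explicit workspace.
// Not the actual fallback kernel; BLOCK = 180 is an assumed block size.
float blocked_sum(const float* src, size_t B, float* workspace) {
    constexpr size_t BLOCK = 180;
    size_t num_partials = 0;
    for (size_t i = 0; i < B; i += BLOCK) {
        size_t end = (i + BLOCK < B) ? i + BLOCK : B;
        float partial = 0.f;
        for (size_t j = i; j < end; ++j)
            partial += src[j];  // each accumulation chain stays short
        workspace[num_partials++] = partial;  // spill the per-block partial sum
    }
    float sum = 0.f;
    for (size_t i = 0; i < num_partials; ++i)
        sum += workspace[i];  // reduce the much shorter partial-sum array
    return sum;
}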
@@ -5,7 +5,6 @@
#include "midout.h"
#include "reducer.h"
#include "src/common/reduce_helper.h"
MIDOUT_DECL(megdnn_fb_reduce_op)
MIDOUT_DECL(megdnn_fb_reduce_c)
@@ -67,6 +66,27 @@ void reduce_exec(size_t A, size_t B, size_t C, Op op) MEGDNN_NOEXCEPT {
namespace megdnn {
namespace fallback {
size_t ReduceImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
MEGDNN_MARK_USED_VAR(src);
MEGDNN_MARK_USED_VAR(dst);
if (src.dtype.enumv() == DTypeEnum::Float32 &&
(param().mode == Mode::MEAN || param().mode == Mode::SUM ||
param().mode == Mode::SUM_SQR)) {
size_t A, B, C;
reduce::get_ABC(src, A, B, C, param().axis);
if (C == 1) {
// Take B = 247 as an example to see how these parameters are chosen.
size_t _60xT_in_4 = (60 * 3) / 4; // T = 3
size_t _60xX_in_4 = 4; // 0 < X < T, i.e. X = 1 or 2
size_t _XXxT_in_4 = 4;
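// Illustrative arithmetic (not from the original source): with T = 3,
// _60xT_in_4 = 180 / 4 = 45, so for B = 247 the formula reserves
// 247 / 45 + 4 + 4 = 13 floats, i.e. 52 bytes of workspace. The two
// constant terms are presumably slack for tail elements that do not
// fill a whole 60 * T block.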
return ((B / _60xT_in_4 + _60xX_in_4 + _XXxT_in_4) * sizeof(float));
}
}
return naive::ReduceForwardImpl::get_workspace_in_bytes(src, dst);
}
void ReduceImpl::exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size);
@@ -178,45 +198,52 @@ void ReduceImpl::exec_fallback(
}
bool ReduceImpl::exec_optimized(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
size_t A, B, C;
reduce::get_ABC(src.layout, A, B, C, param().axis);
bool execed = false;
using Mode = param::Reduce::Mode;
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(0)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \
std::function<void( \
const ctype*, ctype*, DType, size_t, size_t, size_t, \
_megdnn_workspace)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(0)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C, \
workspace)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
std::function<void( \
const ctype*, ctype*, DType, size_t, size_t, size_t, \
_megdnn_workspace)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C, \
workspace)); \
execed = true; \
} \
MIDOUT_END(); \
}
#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \
......
#pragma once
#include "src/common/reduce_helper.h"
#include "src/naive/reduce/opr_impl.h"
namespace megdnn {
@@ -13,6 +14,8 @@ public:
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
void exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) override;
};
} // namespace fallback
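For context, here is a hypothetical caller-side sketch (not part of this diff) of how the new get_workspace_in_bytes() override is meant to be used: since the fallback Reduce may now request a non-zero workspace for float32 SUM/MEAN/SUM_SQR with C == 1, callers should size the workspace through the operator rather than assuming zero bytes. The function name run_sum_reduce and the surrounding setup are illustrative.

#include <cstdint>
#include <vector>
#include "megdnn/handle.h"
#include "megdnn/oprs.h"

// Hypothetical usage sketch; assumes `handle`, `src` and `dst` are already
// prepared by the caller (layouts deduced, memory allocated).
void run_sum_reduce(
        megdnn::Handle* handle, const megdnn::TensorND& src,
        const megdnn::TensorND& dst) {
    auto opr = handle->create_operator<megdnn::Reduce>();
    opr->param().mode = megdnn::param::Reduce::Mode::SUM;
    opr->param().axis = 0;
    // query the operator for its workspace requirement and allocate it
    size_t ws_size = opr->get_workspace_in_bytes(src.layout, dst.layout);
    std::vector<uint8_t> ws_storage(ws_size);
    megdnn::Workspace workspace(
            reinterpret_cast<megdnn::dt_byte*>(ws_storage.data()), ws_size);
    opr->exec(src, dst, workspace);
}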
......
This diff is collapsed.
@@ -352,6 +352,78 @@ TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) {
};
run();
}
TEST_F(FALLBACK, BENCHMARK_REDUCE) {
auto run = [&]() {
Benchmarker<Reduce> benchmarker_reduce(handle());
benchmarker_reduce.set_display(false);
using Mode = param::Reduce::Mode;
constexpr size_t RUNS = 100;
benchmarker_reduce.set_times(RUNS);
TensorShape small{3 * 224 * 224};
TensorShape large{3 * 224 * 224 * 100};
param::Reduce param;
param.axis = 0;
for (auto i = 224; i < 224 * 2; i++) {
for (auto mode : {Mode::SUM, Mode::MEAN, Mode::SUM_SQR}) {
param.mode = mode;
benchmarker_reduce.set_param(param);
auto reduce = benchmarker_reduce.execs({{3 * 224 * i}, {}}) / RUNS;
MEGDNN_MARK_USED_VAR(reduce);
}
}
param.mode = param::Reduce::Mode::SUM;
benchmarker_reduce.set_param(param);
printf("SUM\n");
{
TensorLayout src(small, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
printf("case 1: reduce use time %fms\n", reduce);
}
{
TensorLayout src(large, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
printf("case 1: reduce use time %fms\n", reduce);
}
param.mode = param::Reduce::Mode::MEAN;
benchmarker_reduce.set_param(param);
printf("MEAN\n");
{
TensorLayout src(small, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
printf("case 2: reduce use time %fms\n", reduce);
}
{
TensorLayout src(large, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
printf("case 2: reduce use time %fms\n", reduce);
}
param.mode = param::Reduce::Mode::SUM_SQR;
benchmarker_reduce.set_param(param);
printf("SUM_SQR\n");
{
TensorLayout src(small, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
printf("case 3: reduce use time %fms\n", reduce);
}
{
TensorLayout src(large, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
printf("case 3: reduce use time %fms\n", reduce);
}
};
run();
}
#endif
// vim: syntax=cpp.doxygen