From b3f46734e731b2543ad8534db415e77965933526 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 1 Jun 2022 10:23:14 +0800
Subject: [PATCH] feat(megdnn/softmax): add softmax operator in fallback

GitOrigin-RevId: 97bc32f561e199c2a66393f281510aede82a0ea3
---
 dnn/src/fallback/handle.cpp           |   2 +
 dnn/src/fallback/softmax/opr_impl.cpp | 163 ++++++++++++++++++++++++++
 dnn/src/fallback/softmax/opr_impl.h   |  45 +++++++
 dnn/test/fallback/softmax.cpp         |  56 +++++++++
 dnn/test/naive/softmax.cpp            |  57 ---------
 5 files changed, 266 insertions(+), 57 deletions(-)
 create mode 100644 dnn/src/fallback/softmax/opr_impl.cpp
 create mode 100644 dnn/src/fallback/softmax/opr_impl.h
 create mode 100644 dnn/test/fallback/softmax.cpp

diff --git a/dnn/src/fallback/handle.cpp b/dnn/src/fallback/handle.cpp
index 93f28003b..aa787c72a 100644
--- a/dnn/src/fallback/handle.cpp
+++ b/dnn/src/fallback/handle.cpp
@@ -21,6 +21,7 @@
 #include "src/fallback/resize/opr_impl.h"
 #include "src/fallback/roi_copy/opr_impl.h"
 #include "src/fallback/rotate/opr_impl.h"
+#include "src/fallback/softmax/opr_impl.h"
 #include "src/fallback/split/opr_impl.h"
 #include "src/fallback/tile/opr_impl.h"
 #include "src/fallback/type_cvt/opr_impl.h"
@@ -50,6 +51,7 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(TypeCvt)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(GroupLocal)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(Flip)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(GaussianBlur)
+MEGDNN_SPECIALIZE_CREATE_OPERATOR(SoftmaxForward)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(ROICopy)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(Rotate)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(ElemwiseMultiType)
diff --git a/dnn/src/fallback/softmax/opr_impl.cpp b/dnn/src/fallback/softmax/opr_impl.cpp
new file mode 100644
index 000000000..15623ad4e
--- /dev/null
+++ b/dnn/src/fallback/softmax/opr_impl.cpp
@@ -0,0 +1,163 @@
+#include "src/fallback/softmax/opr_impl.h"
+#include <algorithm>
+#include <cmath>
+#include "src/fallback/elemwise/gi_impl/gi_mathfun.h"
+#include "src/naive/handle.h"
+
+namespace megdnn {
+namespace fallback {
+void SoftmaxForwardImpl::exec(
+        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
+    auto axis = param().axis;
+    if (axis < 0)
+        axis += src.layout.ndim;
+    megdnn_assert(axis >= 0);
+    check_exec(src.layout, dst.layout, workspace.size);
+
+    if (!usable(src.layout)) {
+        naive::SoftmaxForwardImpl::exec(src, dst, workspace);
+        return;
+    }
+
+    typedef DTypeTrait<dtype::Float32>::ctype Float32;
+    auto sptr = src.ptr<Float32>();
+    auto dptr = dst.ptr<Float32>();
+
+    constexpr auto float_min = std::numeric_limits<Float32>::min();
+    constexpr auto step = GI_SIMD_LEN_BYTE / sizeof(Float32);
+
+    size_t A, B, C;
+    reduce::get_ABC(src.layout, A, B, C, axis);
+
+    // TODO: When C=2,3,4..., src_ptr span is relatively large, the performance may
+    // be poor
+    if (C != 1) {
+        WorkspaceBundle workspace_bundle{
+                workspace.raw_ptr, {A * C * sizeof(Float32), A * C * sizeof(Float32)}};
+        Float32* max = workspace_bundle.get_workspace(0).raw_ptr->as<Float32>();
+        GI_FLOAT32_t v_max = GiBroadcastFloat32(float_min);
+        size_t i = 0;
+        for (; i + step <= A * C; i += step)
+            GiStoreFloat32(max + i, v_max);
+        for (; i < A * C; i++)
+            max[i] = float_min;
+
+        for (size_t a = 0; a < A; a++) {
+            for (size_t b = 0; b < B; b++) {
+                auto max_ptr = max + a * C;
+                auto limit = max_ptr + C;
+                auto src_ptr = sptr + a * B * C + b * C;
+
+                for (; max_ptr + step <= limit; max_ptr += step, src_ptr += step) {
+                    GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr);
+                    GI_FLOAT32_t v_max = GiLoadFloat32(max_ptr);
+                    v_max = GiMaximumFloat32(v_max, v_p);
+                    GiStoreFloat32(max_ptr, v_max);
+                }
+                for (; max_ptr < limit; ++max_ptr, ++src_ptr) {
+                    *max_ptr = std::max(*src_ptr, *max_ptr);
+                }
+            }
+        }
+
+        Float32* sum = workspace_bundle.get_workspace(1).raw_ptr->as<Float32>();
+        memset(sum, 0, A * C * sizeof(Float32));
+        for (size_t a = 0; a < A; a++) {
+            for (size_t b = 0; b < B; b++) {
+                auto max_ptr = max + a * C;
+                auto limit = max_ptr + C;
+                auto sum_ptr = sum + a * C;
+                auto src_ptr = sptr + a * B * C + C * b;
+                auto dst_ptr = dptr + a * B * C + C * b;
+                for (; max_ptr + step <= limit; max_ptr += step, sum_ptr += step,
+                        src_ptr += step, dst_ptr += step) {
+                    GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr);
+                    GI_FLOAT32_t v_max = GiLoadFloat32(max_ptr);
+                    GI_FLOAT32_t v_sum = GiLoadFloat32(sum_ptr);
+                    v_p = GiExpPsFloat32(GiSubtractFloat32(v_p, v_max));
+                    v_sum = GiAddFloat32(v_p, v_sum);
+                    GiStoreFloat32(dst_ptr, v_p);
+                    GiStoreFloat32(sum_ptr, v_sum);
+                }
+                for (; max_ptr < limit; ++max_ptr, ++sum_ptr, ++src_ptr, ++dst_ptr) {
+                    *dst_ptr = exp(*src_ptr - *max_ptr);
+                    *sum_ptr += *dst_ptr;
+                }
+            }
+        }
+
+        for (size_t a = 0; a < A; a++) {
+            for (size_t b = 0; b < B; b++) {
+                auto sum_ptr = sum + a * C;
+                auto limit = sum_ptr + C;
+                auto dst_ptr = dptr + a * B * C + C * b;
+                for (; sum_ptr + step <= limit; sum_ptr += step, dst_ptr += step) {
+                    GI_FLOAT32_t v_p = GiLoadFloat32(dst_ptr);
+                    GI_FLOAT32_t v_sum = GiLoadFloat32(sum_ptr);
+                    v_p = GiDivideFloat32(v_p, v_sum);
+                    GiStoreFloat32(dst_ptr, v_p);
+                }
+                for (; sum_ptr < limit; ++sum_ptr, ++dst_ptr)
+                    *dst_ptr = *dst_ptr / *sum_ptr;
+            }
+        }
+    } else {
+        for (size_t a = 0; a < A; a++) {
+            auto max = float_min;
+            {
+                auto src_ptr = sptr + a * B;
+                auto limit = src_ptr + B;
+                GI_FLOAT32_t v_max = GiBroadcastFloat32(max);
+
+                for (; src_ptr + step <= limit; src_ptr += step) {
+                    GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr);
+                    v_max = GiMaximumFloat32(v_max, v_p);
+                }
+                max = std::max(max, GiReduceMaxNanFloat32(v_max));
+                for (; src_ptr < limit; ++src_ptr) {
+                    max = std::max(*src_ptr, max);
+                }
+            }
+
+            auto sum = 0.f;
+            {
+                auto src_ptr = sptr + a * B;
+                auto limit = src_ptr + B;
+                auto dst_ptr = dptr + a * B;
+                GI_FLOAT32_t v_sum = GiZeroFloat32();
+                GI_FLOAT32_t v_max = GiBroadcastFloat32(max);
+
+                for (; src_ptr + step <= limit; src_ptr += step, dst_ptr += step) {
+                    GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr);
+                    v_p = GiExpPsFloat32(GiSubtractFloat32(v_p, v_max));
+                    GiStoreFloat32(dst_ptr, v_p);
+                    v_sum = GiAddFloat32(v_sum, v_p);
+                }
+                sum += GiReduceAddFloat32(v_sum);
+                for (; src_ptr < limit; ++src_ptr, ++dst_ptr) {
+                    *dst_ptr = exp(*src_ptr - max);
+                    sum += *dst_ptr;
+                }
+            }
+            {
+                auto dst_ptr = dptr + a * B;
+                auto limit = dst_ptr + B;
+                sum = 1 / sum;
+                GI_FLOAT32_t v_sum = GiBroadcastFloat32(sum);
+                for (; dst_ptr + step <= limit; dst_ptr += step) {
+                    GI_FLOAT32_t v_p = GiLoadFloat32(dst_ptr);
+                    v_p = GiMultiplyFloat32(v_p, v_sum);
+                    GiStoreFloat32(dst_ptr, v_p);
+                }
+                for (; dst_ptr < limit; ++dst_ptr) {
+                    *dst_ptr *= sum;
+                }
+            }
+        }
+    }
+}
+
+} // namespace fallback
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/softmax/opr_impl.h b/dnn/src/fallback/softmax/opr_impl.h
new file mode 100644
index 000000000..41a1d31ab
--- /dev/null
+++ b/dnn/src/fallback/softmax/opr_impl.h
@@ -0,0 +1,45 @@
+#pragma once
+#include "megdnn/tensor_format.h"
+#include "src/common/reduce_helper.h"
+#include "src/common/utils.h"
+#include "src/naive/softmax/opr_impl.h"
+
+namespace megdnn {
+namespace fallback {
+
+class SoftmaxForwardImpl : public naive::SoftmaxForwardImpl {
+public:
+    using naive::SoftmaxForwardImpl::SoftmaxForwardImpl;
+    void exec(
+            _megdnn_tensor_in src, _megdnn_tensor_out dst,
+            _megdnn_workspace workspace) override;
+    bool usable(const TensorLayout& src) {
+        return src.is_contiguous() && (src.dtype.enumv() == DTypeEnum::Float32) &&
+               (src.format.type() == TensorFormat::Type::DEFAULT);
+    }
+    size_t get_workspace_in_bytes(
+            const TensorLayout& src, const TensorLayout& dst) override {
+        if (!usable(src)) {
+            return naive::SoftmaxForwardImpl::get_workspace_in_bytes(src, dst);
+        }
+
+        auto axis = param().axis;
+        if (axis < 0)
+            axis += src.ndim;
+        typedef DTypeTrait<dtype::Float32>::ctype Float32;
+
+        size_t A, B, C;
+        reduce::get_ABC(src, A, B, C, axis);
+        if (C != 1) {
+            return WorkspaceBundle(
+                           nullptr, {A * C * sizeof(Float32), A * C * sizeof(Float32)})
+                    .total_size_in_bytes();
+        }
+
+        return 0;
+    }
+};
+
+} // namespace fallback
+} // namespace megdnn
+// vim: syntax=cpp.doxygen
diff --git a/dnn/test/fallback/softmax.cpp b/dnn/test/fallback/softmax.cpp
new file mode 100644
index 000000000..bc19f9928
--- /dev/null
+++ b/dnn/test/fallback/softmax.cpp
@@ -0,0 +1,56 @@
+#include "test/fallback/fixture.h"
+
+#include "megdnn/oprs.h"
+#include "test/common/benchmarker.h"
+#include "test/common/checker.h"
+#include "test/common/task_record_check.h"
+#include "test/common/tensor.h"
+#include "test/common/workspace_wrapper.h"
+
+namespace megdnn {
+namespace test {
+
+TEST_F(FALLBACK, SOFTMAX_FORWARD) {
+    Checker<Softmax> checker(handle());
+
+    Softmax::Param param0{0};
+    checker.set_param(param0).exec(TensorShapeArray{{11}, {}});
+    checker.set_param(param0).exec(TensorShapeArray{{11, 11}, {}});
+    checker.set_param(param0).exec(TensorShapeArray{{11, 11, 11}, {}});
+    checker.set_param(param0).exec(TensorShapeArray{{11, 11, 11, 11}, {}});
+    checker.set_param(param0).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}});
+    checker.set_param(param0).exec(TensorShapeArray{{11, 7, 5, 5, 5, 11}, {}});
+    checker.set_param(param0).exec(TensorShapeArray{{11, 7, 5, 7, 5, 7, 7}, {}});
+    Softmax::Param param1{1};
+    checker.set_param(param1).exec(TensorShapeArray{{11, 11}, {}});
+    checker.set_param(param1).exec(TensorShapeArray{{11, 11, 11}, {}});
+    checker.set_param(param1).exec(TensorShapeArray{{11, 11, 11, 11}, {}});
+    checker.set_param(param1).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}});
+    checker.set_param(param1).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}});
+    checker.set_param(param1).exec(TensorShapeArray{{11, 7, 5, 7, 5, 7, 7}, {}});
+    Softmax::Param param2{2};
+    checker.set_param(param2).exec(TensorShapeArray{{11, 11, 11}, {}});
+    checker.set_param(param2).exec(TensorShapeArray{{11, 11, 11, 11}, {}});
+    checker.set_param(param2).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}});
+    checker.set_param(param2).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}});
+    checker.set_param(param2).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}});
+    Softmax::Param param3{3};
+    checker.set_param(param3).exec(TensorShapeArray{{11, 11, 11, 11}, {}});
+    checker.set_param(param3).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}});
+    checker.set_param(param3).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}});
+    checker.set_param(param3).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}});
+    Softmax::Param param4{4};
+    checker.set_param(param4).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}});
+    checker.set_param(param4).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}});
+    checker.set_param(param4).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}});
+    Softmax::Param param5{5};
+    checker.set_param(param5).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}});
+    checker.set_param(param5).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}});
+    Softmax::Param param6{6};
+    checker.set_param(param6).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}});
+}
+
+} // namespace test
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/test/naive/softmax.cpp b/dnn/test/naive/softmax.cpp
index 4ac616bc5..4bf853898 100644
--- a/dnn/test/naive/softmax.cpp
+++ b/dnn/test/naive/softmax.cpp
@@ -43,60 +43,3 @@ TEST_F(NAIVE, SOFTMAX_BACKWARD) {
 
     checker.set_param(param).exect(Testcase{input, diff, {}}, Testcase{{}, {}, output});
 }
-
-TEST_F(NAIVE, SOFTMAX_FORWARD_NHWCD4) {
-    Checker<Softmax> checker(handle(), false);
-    Softmax::Param param{0};
-
-    TensorND input1 = TensorValue(
-            {1, 2, 1, 2, 4}, dtype::Float32(),
-            {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
-    TensorND output1 = TensorValue(
-            {1, 2, 1, 2, 4}, dtype::Float32(),
-            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-    checker.set_param(param).exect(Testcase{input1, {}}, Testcase{{}, output1});
-
-    TensorND input2 = TensorValue(
-            {2, 2, 1, 2, 4}, dtype::Float32(),
-            {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
-             16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
-    TensorND output2 = TensorValue(
-            {2, 2, 1, 2, 4}, dtype::Float32(),
-            {1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01});
-    checker.set_param(param).exect(Testcase{input2, {}}, Testcase{{}, output2});
-}
-
-TEST_F(NAIVE, SOFTMAX_BACKWARD_NHWCD4) {
-    Checker<SoftmaxBackward> checker(handle(), false);
-    Softmax::Param param{0};
-
-    TensorND input = TensorValue(
-            {2, 2, 1, 2, 4}, dtype::Float32(),
-            {1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
-             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01});
-
-    TensorND diff = TensorValue(
-            {2, 2, 1, 2, 4}, dtype::Float32(),
-            {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.});
-
-    TensorND output = TensorValue(
-            {2, 2, 1, 2, 4}, dtype::Float32(),
-            {0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.});
-
-    checker.set_param(param).exect(Testcase{input, diff, {}}, Testcase{{}, {}, output});
-}
-- 
GitLab
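Note (not part of the patch): the kernel above is the standard three-pass softmax applied to the (A, B, C) view produced by reduce::get_ABC, where A is the product of the dimensions before the softmax axis, B is the axis itself, and C is the product of the dimensions after it; the C != 1 branch vectorizes across C with GI intrinsics, the C == 1 branch across B. A minimal scalar sketch of that scheme in plain C++ is shown below for reference; the function name softmax_abc and the example shape are illustrative only, not part of the megdnn API.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Scalar reference: softmax along the middle axis of a tensor viewed as (A, B, C).
void softmax_abc(const float* src, float* dst, size_t A, size_t B, size_t C) {
    for (size_t a = 0; a < A; ++a) {
        for (size_t c = 0; c < C; ++c) {
            const float* in = src + a * B * C + c;
            float* out = dst + a * B * C + c;
            // Pass 1: maximum along B, for numerical stability.
            float max_v = in[0];
            for (size_t b = 1; b < B; ++b)
                max_v = std::max(max_v, in[b * C]);
            // Pass 2: exponentiate the shifted values and accumulate their sum.
            float sum = 0.f;
            for (size_t b = 0; b < B; ++b) {
                out[b * C] = std::exp(in[b * C] - max_v);
                sum += out[b * C];
            }
            // Pass 3: normalize so each (a, :, c) slice sums to one.
            for (size_t b = 0; b < B; ++b)
                out[b * C] /= sum;
        }
    }
}

int main() {
    // A 2x3x2 tensor with softmax along axis 1 maps to A = 2, B = 3, C = 2.
    std::vector<float> src = {0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0};
    std::vector<float> dst(src.size());
    softmax_abc(src.data(), dst.data(), 2, 3, 2);
    for (float v : dst)
        std::printf("%.4f ", v);
    std::printf("\n");
    return 0;
}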