diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu
index 260783c4388fcb2c735925899af431805bbf4a6d..21ab8bb3ccdd0cc742771c3535ecb01d25c8e978 100644
--- a/modules/dnn/src/cuda/eltwise_ops.cu
+++ b/modules/dnn/src/cuda/eltwise_ops.cu
@@ -102,6 +102,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
                 v_store(output_vPtr[i], vec_x);
             }
         }
+
+        template <class T, std::size_t N>
+        __global__ void eltwise_div_2_vec(Span<T> output, View<T> x, View<T> y) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto output_vPtr = vector_type::get_pointer(output.data());
+            auto x_vPtr = vector_type::get_pointer(x.data());
+            auto y_vPtr = vector_type::get_pointer(y.data());
+
+            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+                vector_type vec_x, vec_y;
+                v_load(vec_x, x_vPtr[i]);
+                v_load(vec_y, y_vPtr[i]);
+
+                for (int j = 0; j < vector_type::size(); j++)
+                    vec_x.data[j] = vec_x.data[j] / vec_y.data[j];
+
+                v_store(output_vPtr[i], vec_x);
+            }
+        }
     }
 
     template <class T, std::size_t N>
@@ -221,4 +241,32 @@
     template void eltwise_prod_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
     template void eltwise_prod_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
 
+    template <class T, std::size_t N>
+    void launch_vectorized_eltwise_div_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(x, N));
+        CV_Assert(is_fully_aligned<T>(y, N));
+
+        auto kernel = raw::eltwise_div_2_vec<T, N>;
+        auto policy = make_policy(kernel, output.size() / N, 0, stream);
+        launch_kernel(kernel, policy, output, x, y);
+    }
+
+    template <class T>
+    void eltwise_div_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(x.size() == y.size());
+        CV_Assert(x.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
+            launch_vectorized_eltwise_div_2<T, 4>(stream, output, x, y);
+        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
+            launch_vectorized_eltwise_div_2<T, 2>(stream, output, x, y);
+        } else {
+            launch_vectorized_eltwise_div_2<T, 1>(stream, output, x, y);
+        }
+    }
+
+    template void eltwise_div_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+    template void eltwise_div_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
index 7d84d07028ebd6bd8d0153e66ecc75569e63c3a3..092b1571af5918f3e6d6ea266a41f7a3be72ea75 100644
--- a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
@@ -24,6 +24,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void eltwise_prod_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
 
+    template <class T>
+    void eltwise_div_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
index c044730daf6807b1cc9938d19aa0092c9f35b3be..fd06d015c0f8170ef7ac14308c97a1651de00122 100644
--- a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
@@ -24,7 +24,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
     enum class EltwiseOpType {
         MAX,
         SUM,
-        PRODUCT
+        PRODUCT,
+        DIV
     };
 
     template <class T>
@@ -64,6 +65,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                 {
                 case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, input_x, input_y); break;
                 case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, input_x, input_y); break;
+                case EltwiseOpType::DIV: kernels::eltwise_div_2<T>(stream, output, input_x, input_y); break;
                 case EltwiseOpType::SUM:
                     if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1))
                         kernels::eltwise_sum_2<T>(stream, output, input_x, input_y);
@@ -89,6 +91,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     {
                     case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, output, input); break;
                     case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, output, input); break;
+                    case EltwiseOpType::DIV: kernels::eltwise_div_2<T>(stream, output, output, input); break;
                     case EltwiseOpType::SUM:
                         if (coeffs.empty() || coeffs[i] == 1)
                             kernels::eltwise_sum_2<T>(stream, output, output, input);
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 1eb737c508e45262d423fe2358b852c485b1bd36..52d1849cef3676acd12a39b597c41dbb35da023a 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -108,7 +108,7 @@ public:
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
         return backendId == DNN_BACKEND_OPENCV ||
-               (backendId == DNN_BACKEND_CUDA && op != DIV) ||  // TODO: not implemented, see PR #15811
+               backendId == DNN_BACKEND_CUDA ||
                (backendId == DNN_BACKEND_HALIDE && op != DIV) ||  // TODO: not implemented, see PR #15811
                ((((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()))
                   || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && !variableChannels));
@@ -471,6 +471,7 @@ public:
             case MAX: return cuda4dnn::EltwiseOpType::MAX;
             case SUM: return cuda4dnn::EltwiseOpType::SUM;
             case PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
+            case DIV: return cuda4dnn::EltwiseOpType::DIV;
             }
             return cuda4dnn::EltwiseOpType::SUM;
         }();
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 9c21f7fc83c078364e2361b14fe2a549cd9dfb31..ce8a43a5432d0f1f48d94d54e08cc4173a86b96a 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -380,6 +380,7 @@ TEST_P(Test_ONNX_layers, Div)
     normAssert(ref, out, "", default_l1, default_lInf);
 
     expectNoFallbacksFromIE(net);
+    expectNoFallbacksFromCUDA(net);
 }
 
 TEST_P(Test_ONNX_layers, DynamicReshape)