From 6982006f36f8e9e530b1ed63d7736715f3f21c7f Mon Sep 17 00:00:00 2001 From: yejianwu Date: Wed, 28 Mar 2018 11:13:35 +0800 Subject: [PATCH] add compatible method to space2depth and depth2space ops --- mace/kernels/opencl/cl/depth_to_space.cl | 35 +++++++++++++ mace/kernels/opencl/depth_to_space_opencl.cc | 54 ++++++++++++-------- 2 files changed, 68 insertions(+), 21 deletions(-) diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 824f8266..349b665d 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -3,11 +3,28 @@ __kernel void depth_to_space(__read_only image2d_t input, __private const int block_size, __private const int output_depth, +#ifndef USE_QUALCOMM_OPENCL_2_0 + __write_only image2d_t output, + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2) { +#else __write_only image2d_t output) { +#endif + const int out_d = get_global_id(0); const int out_w = get_global_id(1); const int out_h = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 + if (out_d >= global_size_dim0 || out_w >= global_size_dim1 + || out_h >= global_size_dim2) { + return; + } + const int output_width = global_size_dim1; +#else const int output_width = get_global_size(1); +#endif const int out_pos = mad24(out_d, output_width, out_w); @@ -30,11 +47,29 @@ __kernel void depth_to_space(__read_only image2d_t input, __kernel void space_to_depth(__read_only image2d_t input, __private const int block_size, __private const int input_depth, +#ifndef USE_QUALCOMM_OPENCL_2_0 + __write_only image2d_t output, + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2) { +#else __write_only image2d_t output) { +#endif + const int d = get_global_id(0); const int w = get_global_id(1); const int h = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 + if (d >= global_size_dim0 || w >= global_size_dim1 + || h >= global_size_dim2) { + return; + } + const int input_width = global_size_dim1; +#else const int input_width = get_global_size(1); +#endif + const int in_pos = mad24(d, input_width, w); const int output_width = input_width / block_size; diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index c39c1a34..0bafecd8 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -45,8 +45,11 @@ void DepthToSpaceOpFunctor::operator()( CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); output->ResizeImage(output_shape, image_shape); + auto runtime = OpenCLRuntime::Global(); + + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; @@ -55,38 +58,47 @@ void DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("depth_to_space", kernel_name, built_options); } + + uint32_t gws[3]; + std::stringstream ss; if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, block_size_); kernel_.setArg(idx++, depth_blocks); kernel_.setArg(idx++, *(output->opencl_image())); + + if (d2s_) { + gws[0] = static_cast(depth_blocks); + gws[1] = static_cast(output_width); + gws[2] = static_cast(output_height * batch); + ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_" + << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); + } else { + gws[0] = static_cast(depth_blocks); + gws[1] = static_cast(input_width); + gws[2] = static_cast(input_height * batch); + ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" + << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); + } + + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + input_shape_ = input->shape(); } - if (d2s_) { - const uint32_t gws[3] = {static_cast(depth_blocks), - static_cast(output_width), - static_cast(output_height * batch)}; - const std::vector lws = {8, 16, 8, 1}; - std::stringstream ss; - ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_" - << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); - } else { - const uint32_t gws[3] = {static_cast(depth_blocks), - static_cast(input_width), - static_cast(input_height * batch)}; - const std::vector lws = {8, 16, 8, 1}; - std::stringstream ss; - ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" - << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); - } + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + const std::vector lws = {8, kwg_size / 64, 8, 1}; + TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } template struct DepthToSpaceOpFunctor; -- GitLab