From 9fa65e6ad45ba21eaf40162690ef6330ee6bace5 Mon Sep 17 00:00:00 2001
From: luxuhui
Date: Wed, 28 Oct 2020 09:12:04 +0800
Subject: [PATCH] opt: optimize the performance of `Reduce` OP

N/A

Signed-off-by: Luxuhui
---
 mace/ops/opencl/cl/reduce.cl    | 125 +++++++-----------
 mace/ops/opencl/image/reduce.cc | 222 +++++++++++++++++++-------------
 mace/ops/opencl/image/reduce.h  |   9 ++
 3 files changed, 186 insertions(+), 170 deletions(-)

diff --git a/mace/ops/opencl/cl/reduce.cl b/mace/ops/opencl/cl/reduce.cl
index 8f14a30c..f1e10993 100644
--- a/mace/ops/opencl/cl/reduce.cl
+++ b/mace/ops/opencl/cl/reduce.cl
@@ -1,102 +1,69 @@
 #include <common.h>
 
+#if REDUCE_TYPE == 1  // MIN
+#define INIT_REDUCE_VALUE (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT}
+#define REDUCE_VALUE(x, y) fmin(x, y)
+#elif REDUCE_TYPE == 2  // MAX
+#define INIT_REDUCE_VALUE (DATA_TYPE4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT}
+#define REDUCE_VALUE(x, y) fmax(x, y)
+#elif REDUCE_TYPE == 3  // PROD
+#define INIT_REDUCE_VALUE (DATA_TYPE4){1, 1, 1, 1}
+#define REDUCE_VALUE(x, y) (x * y)
+#else  // MEAN or SUM
+#define INIT_REDUCE_VALUE (DATA_TYPE4){0, 0, 0, 0}
+#define REDUCE_VALUE(x, y) (x + y)
+#endif
+
+
 __kernel void reduce(OUT_OF_RANGE_PARAMS
                      GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input,
-                     __local float4 *local_buffer,
-                     __private const int group_num,
-                     __private const int compute_size,
-                     __private const int last_index,
+                     __private const int out_height,
+                     __private const int out_width,
                      __private const int in_height,
                      __private const int in_width,
-                     __private const float scale,
+                     __private const int org_height,
+                     __private const int org_width,
                      __private const int channel_blocks,
                      __write_only image2d_t output) {
-  const int w = get_local_id(0);
-  const int h = get_local_id(1);
+  const int ow = get_global_id(0);
+  const int oh = get_global_id(1);
   const int bc = get_global_id(2);
-
 #ifndef NON_UNIFORM_WORK_GROUP
   if (bc >= global_size_dim2) return;
 #endif
-  const int width = get_local_size(0);
-  const int index = mad24(h, width, w);
+
   const int b = bc / channel_blocks;
-  const int ch = mad24(b, -channel_blocks, bc);
+  const int c = bc % channel_blocks;
+  const int tile_w = in_width / out_width;
+  const int tile_h = in_height / out_height;
+  const int start_w = tile_w * ow;
+  const int start_h = tile_h * oh;
 
-  DATA_TYPE4 in;
+  const int size_w = select(tile_w, in_width - start_w, ow >= out_width - 1);
+  const int size_h = select(tile_h, in_height - start_h, oh >= out_height - 1);
+  const int end_h = start_h + size_h;
+  const int end_w = start_w + size_w;
 
-#if REDUCE_TYPE == 1
-  DATA_TYPE4 part_result = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
-#elif REDUCE_TYPE == 2
-  DATA_TYPE4 part_result = (DATA_TYPE4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
-#elif REDUCE_TYPE == 3
-  DATA_TYPE4 part_result = (DATA_TYPE4){1, 1, 1, 1};
-#else
-  DATA_TYPE4 part_result = (DATA_TYPE4){0, 0, 0, 0};
-#endif
-  const bool after_last = (last_index > 0 && index >= last_index);
-  // After last index, each kernel only computes (compute_size - 1) elements.
-  const int actual_compute_size = select(compute_size,
-                                         compute_size - 1,
-                                         after_last);
-  const int base_offset = mul24(index, actual_compute_size);
-  const int offset = select(base_offset,
-                            base_offset + last_index,
-                            after_last);
+  DATA_TYPE4 in;
+  DATA_TYPE4 out = INIT_REDUCE_VALUE;
 #pragma unroll
-  for (int i = 0; i < actual_compute_size; ++i) {
-    int element_idx = offset + i;
-    int h_idx = element_idx / in_width;
-    int w_idx = mad24(h_idx, -in_width, element_idx);
-    int pos_x = mad24(ch, in_width, w_idx);
-    int pos_y = mad24(b, in_height, h_idx);
-    in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y));
-// MIN
-#if REDUCE_TYPE == 1
-    part_result = fmin(part_result, in);
-// MAX
-#elif REDUCE_TYPE == 2
-    part_result = fmax(part_result, in);
-// PROD
-#elif REDUCE_TYPE == 3
-    part_result = part_result * in;
-// MEAN or SUM
-#else
-    part_result = part_result + in;
-#endif
+  for (int h = start_h; h < end_h; ++h) {
+    for (int w = start_w; w < end_w; ++w) {
+      int pos_x = mad24(c, in_width, w);
+      int pos_y = mad24(b, in_height, h);
+      in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y));
+      out = REDUCE_VALUE(out, in);
+    }
   }
-
 #if REDUCE_TYPE == 0
-  part_result = part_result * scale;
+  if (out_height == 1 && out_width == 1) {
+    out = out / (org_height * org_width);
+  }
 #endif
 
-  local_buffer[index] = part_result;
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  if (w == 0 && h == 0) {
-#if REDUCE_TYPE == 1
-    DATA_TYPE4 out = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
-#elif REDUCE_TYPE == 2
-    DATA_TYPE4 out = (DATA_TYPE4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
-#elif REDUCE_TYPE == 3
-    DATA_TYPE4 out = (DATA_TYPE4){1, 1, 1, 1};
-#else
-    DATA_TYPE4 out = (DATA_TYPE4){0, 0, 0, 0};
-#endif
-#pragma unroll
-    for (int i = 0; i < group_num; ++i) {
-#if REDUCE_TYPE == 1
-      out = fmin(out, local_buffer[i]);
-#elif REDUCE_TYPE == 2
-      out = fmax(out, local_buffer[i]);
-#elif REDUCE_TYPE == 3
-      out = out * local_buffer[i];
-#else
-      out = out + local_buffer[i];
-#endif
-    }
-    WRITE_IMAGET(output, (int2)(ch, b), out);
-  }
+  int pos_x = mad24(c, out_width, ow);
+  int pos_y = mad24(b, out_height, oh);
+  WRITE_IMAGET(output, (int2)(pos_x, pos_y), out);
 }
diff --git a/mace/ops/opencl/image/reduce.cc b/mace/ops/opencl/image/reduce.cc
index 95a1d567..b42e773d 100644
--- a/mace/ops/opencl/image/reduce.cc
+++ b/mace/ops/opencl/image/reduce.cc
@@ -15,124 +15,164 @@
 #include "mace/ops/opencl/image/reduce.h"
 
 #include <algorithm>
+#include <utility>
+#include <vector>
 
 namespace mace {
 namespace ops {
 namespace opencl {
 namespace image {
 
+namespace {
+const index_t TILE_SIZE = 16;
+
+cl::Image *InitScratchImageAndGetPointer(OpContext *context, DataType dtype,
+                                         ScratchImage *scratch_image,
+                                         const std::vector<index_t> &shape) {
+  std::vector<size_t> image_shape;
+  OpenCLUtil::CalImage2DShape(shape, OpenCLBufferType::IN_OUT_CHANNEL,
+                              &image_shape);
+
+  auto mace_image = scratch_image->Scratch(
+      context->device()->allocator(), image_shape, dtype);
+  cl::Image *image = static_cast<cl::Image *>(mace_image->buffer());
+
+  return image;
+}
+
+}  // namespace
+
+MaceStatus ReduceKernel::BuildReduceKernel(OpenCLRuntime *runtime) {
+  std::set<std::string> built_options;
+  MACE_OUT_OF_RANGE_CONFIG;
+  MACE_NON_UNIFORM_WG_CONFIG;
+  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
+  built_options.emplace("-Dreduce=" + kernel_name);
+  built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
+  built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel(
+      "reduce",
+      kernel_name, built_options, &kernel_));
+  kwg_size_ =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+MaceStatus ReduceKernel::GraduallyComputeReduce(
+    OpContext *context, const index_t batch, const index_t channel_blocks,
+    const index_t in_height, const index_t in_width,
+    const index_t out_height, const index_t out_width,
+    const index_t org_height, const index_t org_width,
+    const cl::Image *input, cl::Image *output) {
+  MACE_OUT_OF_RANGE_DEFINITION;
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
+  if (kernel_.get() == nullptr) {
+    MACE_RETURN_IF_ERROR(BuildReduceKernel(runtime));
+  }
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height),
+                           static_cast<uint32_t>(batch * channel_blocks)};
+  std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  uint32_t idx = 0;
+  MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+  MACE_SET_3D_GWS_ARGS(kernel_, gws);
+  kernel_.setArg(idx++, *input);
+  kernel_.setArg(idx++, static_cast<int32_t>(out_height));
+  kernel_.setArg(idx++, static_cast<int32_t>(out_width));
+  kernel_.setArg(idx++, static_cast<int32_t>(in_height));
+  kernel_.setArg(idx++, static_cast<int32_t>(in_width));
+  kernel_.setArg(idx++, static_cast<int32_t>(org_height));
+  kernel_.setArg(idx++, static_cast<int32_t>(org_width));
+  kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
+  kernel_.setArg(idx++, *output);
+
+  std::string tuning_key = Concat(
+      "reduce_opencl_kernel", gws[0], gws[1], gws[2]);
+
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, context->future()));
+  MACE_OUT_OF_RANGE_VALIDATION;
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
 MaceStatus ReduceKernel::Compute(
     OpContext *context,
     const Tensor *input,
     Tensor *output) {
   MACE_CHECK_NOTNULL(input);
-  index_t batch = input->dim(0);
-  const index_t in_height = input->dim(1);
-  const index_t in_width = input->dim(2);
+  const index_t batch = input->dim(0);
+  const index_t org_height = input->dim(1);
+  const index_t org_width = input->dim(2);
+  index_t in_height = org_height;
+  index_t in_width = org_width;
   const index_t channels = input->dim(3);
   const index_t channel_blocks = RoundUpDiv4(channels);
-  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
 
-  std::vector<uint32_t> gws(3);
-  std::vector<uint32_t> lws(3);
   std::vector<index_t> output_shape{batch, 1, 1, channels};
   std::vector<size_t> output_image_shape;
   OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                               &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
-  MACE_OUT_OF_RANGE_DEFINITION;
-
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    MACE_OUT_OF_RANGE_CONFIG;
-    MACE_NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
-    built_options.emplace("-Dreduce=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
-    built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
-    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
-      built_options.emplace("-DNON_QUALCOMM_ADRENO");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce",
-                                              kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-
-  // In the reduce.cl file, the computation is divided into two steps.
-  // The first step computes `compute_size` times parallelly, and the second
-  // step computes `group_num` times. In order to speed up the computation, we
-  // make the computation times of these two steps as uniform as possible.
-  uint32_t local_wg_size = static_cast<uint32_t>(sqrt(in_height * in_width));
-  // Increase the times of the second step for it's not parallel
-  local_wg_size *= 2;
-  local_wg_size = std::min(local_wg_size, kwg_size_);
-  gws = {4, local_wg_size / 4, static_cast<uint32_t>(batch * channel_blocks)};
-  if (gws[1] == 0) {
-    gws[1] = 1;
-  }
+  MaceStatus result = MaceStatus::MACE_RUNTIME_ERROR;
+  if (in_height <= TILE_SIZE && in_width <= TILE_SIZE) {
+    result = GraduallyComputeReduce(context, batch, channel_blocks, in_height,
+                                    in_width, 1, 1, org_height, org_width,
+                                    input->opencl_image(),
+                                    output->opencl_image());
+  } else {
+    ScratchImageManager *scratch_manager =
+        context->device()->gpu_runtime()->scratch_image_manager();
+    ScratchImage scratch_inter_image(scratch_manager);
+    auto out_height = RoundUpDiv(in_height, TILE_SIZE);
+    auto out_width = RoundUpDiv(in_width, TILE_SIZE);
+    const std::vector<index_t> inter_shape =
+        {{batch, out_height, out_width, channels}};
+    cl::Image *inter_image = InitScratchImageAndGetPointer(
+        context, input->dtype(), &scratch_inter_image, inter_shape);
+    result = GraduallyComputeReduce(context, batch, channel_blocks, in_height,
+                                    in_width, out_height, out_width,
+                                    org_height, org_width,
+                                    input->opencl_image(), inter_image);
+    MACE_RETURN_IF_ERROR(result);
 
-  lws = {gws[0], gws[1], 1};
-  const int group_num = lws[0] * lws[1] * lws[2];
-  // Each kernel intends to compute compute_size elements.
-  const int compute_size = (image_size + group_num - 1) / group_num;
-  const int last_index = image_size % group_num;
-  const float scale = 1.f / (in_width * in_height);
+    in_height = out_height;
+    in_width = out_width;
+    out_height = RoundUpDiv(in_height, TILE_SIZE);
+    out_width = RoundUpDiv(in_width, TILE_SIZE);
 
-  MACE_OUT_OF_RANGE_INIT(kernel_);
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
-    MACE_SET_3D_GWS_ARGS(kernel_, gws);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
-                   nullptr);
-    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
-    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
-    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, scale);
-    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-
-    input_shape_ = input->shape();
-  }
+    if (in_height > TILE_SIZE || in_width > TILE_SIZE) {
+      ScratchImage scratch_inter2_image(scratch_manager);
+      const std::vector<index_t> inter2_shape =
+          {{batch, out_height, out_width, channels}};
+      cl::Image *inter2_image = InitScratchImageAndGetPointer(
+          context, input->dtype(), &scratch_inter2_image, inter2_shape);
 
-  cl::Event event;
-  cl_int error;
-  if (runtime->IsNonUniformWorkgroupsSupported()) {
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  } else {
-    std::vector<uint32_t> roundup_gws(lws.size());
-    for (size_t i = 0; i < lws.size(); ++i) {
-      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+      while (out_height > 1 || out_width > 1) {
+        result = GraduallyComputeReduce(context, batch, channel_blocks,
+                                        in_height, in_width, out_height,
+                                        out_width, org_height, org_width,
+                                        inter_image, inter2_image);
+        MACE_RETURN_IF_ERROR(result);
+        in_height = out_height;
+        in_width = out_width;
+        out_height = RoundUpDiv(in_height, TILE_SIZE);
+        out_width = RoundUpDiv(in_width, TILE_SIZE);
+        std::swap(inter_image, inter2_image);
+      }
     }
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange,
-        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  }
-  MACE_CL_RET_STATUS(error);
-  MACE_OUT_OF_RANGE_VALIDATION;
-  if (context->future() != nullptr) {
-    context->future()->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
+    result = GraduallyComputeReduce(context, batch, channel_blocks, in_height,
+                                    in_width, 1, 1, org_height, org_width,
+                                    inter_image, output->opencl_image());
   }
 
-  return MaceStatus::MACE_SUCCESS;
+  return result;
 }
 
 }  // namespace image
diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h
index 24e889d7..9e9c9065 100644
--- a/mace/ops/opencl/image/reduce.h
+++ b/mace/ops/opencl/image/reduce.h
@@ -42,6 +42,15 @@ class ReduceKernel : public OpenCLReduceKernel {
       const Tensor *input,
       Tensor *output) override;
 
+ private:
+  MaceStatus BuildReduceKernel(OpenCLRuntime *runtime);
+  MaceStatus GraduallyComputeReduce(
+      OpContext *context, const index_t batch, const index_t channel_blocks,
+      const index_t in_height, const index_t in_width,
+      const index_t out_height, const index_t out_width,
+      const index_t org_height, const index_t org_width,
+      const cl::Image *input, cl::Image *output);
+
  private:
   ReduceType reduce_type_;
   const std::vector<int> axis_;
-- 
GitLab
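
Not part of the patch: the host-side pass schedule built by the reworked ReduceKernel::Compute() can be hard to follow from the diff alone, so below is a minimal standalone C++ sketch of it. Only the TILE_SIZE = 16 tiling and the RoundUpDiv logic mirror reduce.cc; the file name, the 224x224 example size, and the plain int types are illustrative assumptions. Each pass collapses up to a TILE_SIZE x TILE_SIZE block into one pixel per (batch, channel block), and the final pass reduces straight to 1x1, which is also where the MEAN variant divides by org_height * org_width.

// reduce_pass_schedule_sketch.cc -- illustrative only, not part of the patch.
#include <cstdio>

namespace {
// Mirrors TILE_SIZE and RoundUpDiv as used in mace/ops/opencl/image/reduce.cc.
const int kTileSize = 16;
int RoundUpDiv(int a, int b) { return (a + b - 1) / b; }
}  // namespace

int main() {
  int in_h = 224, in_w = 224;  // example spatial size (assumption)
  int pass = 0;
  while (in_h > 1 || in_w > 1) {
    // Once the remaining image fits in one tile, Compute() issues the final
    // GraduallyComputeReduce(..., out_height = 1, out_width = 1, ...) call.
    const bool last = (in_h <= kTileSize && in_w <= kTileSize);
    const int out_h = last ? 1 : RoundUpDiv(in_h, kTileSize);
    const int out_w = last ? 1 : RoundUpDiv(in_w, kTileSize);
    std::printf("pass %d: %dx%d -> %dx%d\n", ++pass, in_h, in_w, out_h, out_w);
    in_h = out_h;
    in_w = out_w;
  }
  return 0;
}

For a 224x224 input this prints two passes (224x224 -> 14x14 -> 1x1); a 512x512 input takes three (512x512 -> 32x32 -> 2x2 -> 1x1), which is the case the while loop and the std::swap ping-pong between the two scratch images are there to handle.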