// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/opencl/image/reduce.h"

#include <algorithm>
#include <cmath>

namespace mace {
namespace ops {
namespace opencl {
namespace image {

MaceStatus ReduceKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK_NOTNULL(input);
  index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);

  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;

  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
    built_options.emplace("-Dreduce=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }

  // In the reduce.cl file, the computation is divided into two steps.
  // The first step runs `compute_size` iterations in parallel, and the
  // second step runs `group_num` iterations serially. To speed up the
  // computation, we balance the iteration counts of these two steps as
  // much as possible.
  uint32_t local_wg_size = static_cast<uint32_t>(sqrt(in_height * in_width));
  // Increase the iteration count of the second step, since it is not
  // parallel.
  local_wg_size *= 2;
  local_wg_size = std::min(local_wg_size, kwg_size_);
  gws = {4, local_wg_size / 4, static_cast<uint32_t>(batch * channel_blocks)};
  if (gws[1] == 0) {
    gws[1] = 1;
  }

  lws = {gws[0], gws[1], 1};
  const int group_num = lws[0] * lws[1] * lws[2];
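  // Worked example (illustrative numbers only, not from the original
  // source, and assuming kwg_size_ is at least 112): for a hypothetical
  // 56x56 spatial input, image_size = 3136 and
  // local_wg_size = 2 * sqrt(3136) = 112, giving gws = {4, 28, ...},
  // lws = {4, 28, 1}, and group_num = 112. Step one then has each of the
  // 112 work-items accumulate compute_size = ceil(3136 / 112) = 28 pixels,
  // and step two serially combines the 112 partial results.
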
  // Each work-item is intended to compute `compute_size` elements.
  const int compute_size = (image_size + group_num - 1) / group_num;
  const int last_index = image_size % group_num;
  const float scale = 1.f / (in_width * in_height);

  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Local memory buffer holding one partial result (4 floats) per
    // work-item in the group.
    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)), nullptr);
    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, scale);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));

    input_shape_ = input->shape();
  }

  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Round each global dimension up to a multiple of the local size, so
    // that every enqueued work-group is fully populated on devices without
    // non-uniform work-group support.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;

  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }

  return MaceStatus::MACE_SUCCESS;
}

}  // namespace image
}  // namespace opencl
}  // namespace ops
}  // namespace mace
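
// Usage sketch (hypothetical, not part of the original file): a caller that
// already owns a valid OpContext and GPU image tensors would reduce an NHWC
// input over its spatial dimensions roughly as follows. The ReduceKernel
// constructor signature and the ReduceType enum value are assumptions; only
// Compute() is defined above.
//
//   image::ReduceKernel kernel(ReduceType::MEAN);  // constructor assumed
//   MACE_RETURN_IF_ERROR(kernel.Compute(op_context, input, output));
//   // output now has shape {batch, 1, 1, channels}.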