// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/opencl/image/reduce.h"

#include <set>
#include <string>
#include <utility>
#include <vector>

namespace mace {
namespace ops {
namespace opencl {
namespace image {

namespace {
// Each kernel pass reduces a TILE_SIZE x TILE_SIZE patch of the input to a
// single output element.
const index_t TILE_SIZE = 16;

// Allocates (or reuses) a scratch image large enough to hold `shape` and
// returns the underlying cl::Image.
cl::Image *InitScratchImageAndGetPointer(OpContext *context,
                                         DataType dtype,
                                         ScratchImage *scratch_image,
                                         const std::vector<index_t> &shape) {
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  auto mace_image = scratch_image->Scratch(
      context->device()->allocator(), image_shape, dtype);
  cl::Image *image = static_cast<cl::Image *>(mace_image->buffer());
  return image;
}
}  // namespace

MaceStatus ReduceKernel::BuildReduceKernel(OpenCLRuntime *runtime) {
  std::set<std::string> built_options;
  MACE_OUT_OF_RANGE_CONFIG;
  MACE_NON_UNIFORM_WG_CONFIG;
  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
  built_options.emplace("-Dreduce=" + kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
  built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
  MACE_RETURN_IF_ERROR(runtime->BuildKernel(
      "reduce", kernel_name, built_options, &kernel_));
  kwg_size_ =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

  return MaceStatus::MACE_SUCCESS;
}

MaceStatus ReduceKernel::GraduallyComputeReduce(
    OpContext *context, const index_t batch, const index_t channel_blocks,
    const index_t in_height, const index_t in_width,
    const index_t out_height, const index_t out_width,
    const index_t org_height, const index_t org_width,
    const cl::Image *input, cl::Image *output) {
  MACE_OUT_OF_RANGE_DEFINITION;
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  if (kernel_.get() == nullptr) {
    MACE_RETURN_IF_ERROR(BuildReduceKernel(runtime));
  }

  // One work-item per output element: (width, height, batch * channel_blocks).
  const uint32_t gws[3] = {static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height),
                           static_cast<uint32_t>(batch * channel_blocks)};
  std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  MACE_OUT_OF_RANGE_INIT(kernel_);

  uint32_t idx = 0;
  MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
  MACE_SET_3D_GWS_ARGS(kernel_, gws);
  kernel_.setArg(idx++, *input);
  kernel_.setArg(idx++, static_cast<int>(out_height));
  kernel_.setArg(idx++, static_cast<int>(out_width));
  kernel_.setArg(idx++, static_cast<int>(in_height));
  kernel_.setArg(idx++, static_cast<int>(in_width));
  kernel_.setArg(idx++, static_cast<int>(org_height));
  kernel_.setArg(idx++, static_cast<int>(org_width));
  kernel_.setArg(idx++, static_cast<int>(channel_blocks));
  kernel_.setArg(idx++, *output);

  std::string tuning_key = Concat(
      "reduce_opencl_kernel", gws[0], gws[1], gws[2]);
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));

  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
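// Reduces the spatial dimensions (H, W) of an NHWC input to 1x1. Inputs no
// larger than TILE_SIZE in both dimensions are reduced in a single kernel
// pass. Larger inputs are reduced hierarchically: each pass shrinks each
// spatial dimension by a factor of TILE_SIZE into a scratch image, ping-pong
// swapping between two scratch images, until a final pass can write the 1x1
// result into the output image.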
MaceStatus ReduceKernel::Compute(
    OpContext *context, const Tensor *input, Tensor *output) {
  MACE_CHECK_NOTNULL(input);
  const index_t batch = input->dim(0);
  const index_t org_height = input->dim(1);
  const index_t org_width = input->dim(2);
  index_t in_height = org_height;
  index_t in_width = org_width;
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);

  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

  MaceStatus result = MaceStatus::MACE_RUNTIME_ERROR;
  if (in_height <= TILE_SIZE && in_width <= TILE_SIZE) {
    // Small input: a single pass reduces directly to the 1x1 output.
    result = GraduallyComputeReduce(context, batch, channel_blocks,
                                    in_height, in_width, 1, 1,
                                    org_height, org_width,
                                    input->opencl_image(),
                                    output->opencl_image());
  } else {
    ScratchImageManager *scratch_manager =
        context->device()->gpu_runtime()->scratch_image_manager();
    ScratchImage scratch_inter_image(scratch_manager);
    auto out_height = RoundUpDiv(in_height, TILE_SIZE);
    auto out_width = RoundUpDiv(in_width, TILE_SIZE);
    const std::vector<index_t> inter_shape =
        {{batch, out_height, out_width, channels}};
    cl::Image *inter_image = InitScratchImageAndGetPointer(
        context, input->dtype(), &scratch_inter_image, inter_shape);
    // First pass: reduce from the input into the intermediate scratch image.
    result = GraduallyComputeReduce(context, batch, channel_blocks,
                                    in_height, in_width,
                                    out_height, out_width,
                                    org_height, org_width,
                                    input->opencl_image(), inter_image);
    MACE_RETURN_IF_ERROR(result);

    in_height = out_height;
    in_width = out_width;
    out_height = RoundUpDiv(in_height, TILE_SIZE);
    out_width = RoundUpDiv(in_width, TILE_SIZE);
    if (in_height > TILE_SIZE || in_width > TILE_SIZE) {
      ScratchImage scratch_inter2_image(scratch_manager);
      const std::vector<index_t> inter2_shape =
          {{batch, out_height, out_width, channels}};
      cl::Image *inter2_image = InitScratchImageAndGetPointer(
          context, input->dtype(), &scratch_inter2_image, inter2_shape);
      // Middle passes: ping-pong between the two scratch images until the
      // remaining spatial size fits in a single tile.
      while (out_height > 1 || out_width > 1) {
        result = GraduallyComputeReduce(context, batch, channel_blocks,
                                        in_height, in_width,
                                        out_height, out_width,
                                        org_height, org_width,
                                        inter_image, inter2_image);
        MACE_RETURN_IF_ERROR(result);
        in_height = out_height;
        in_width = out_width;
        out_height = RoundUpDiv(in_height, TILE_SIZE);
        out_width = RoundUpDiv(in_width, TILE_SIZE);
        std::swap(inter_image, inter2_image);
      }
    }
    // Final pass: reduce the last intermediate image into the 1x1 output.
    result = GraduallyComputeReduce(context, batch, channel_blocks,
                                    in_height, in_width, 1, 1,
                                    org_height, org_width,
                                    inter_image, output->opencl_image());
  }

  return result;
}

}  // namespace image
}  // namespace opencl
}  // namespace ops
}  // namespace mace