diff --git a/mace/kernels/batch_to_space.h b/mace/kernels/batch_to_space.h new file mode 100644 index 0000000000000000000000000000000000000000..8198d5766406afcb8365724057173196b1bc87de --- /dev/null +++ b/mace/kernels/batch_to_space.h @@ -0,0 +1,209 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_BATCH_TO_SPACE_H_ +#define MACE_KERNELS_BATCH_TO_SPACE_H_ + +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" +#include "mace/public/mace.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/cl2_header.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +struct BatchToSpaceFunctorBase : OpKernel { + BatchToSpaceFunctorBase(OpKernelContext *context, + const std::vector &paddings, + const std::vector &block_shape) + : OpKernel(context), + paddings_(paddings.begin(), paddings.end()), + block_shape_(block_shape.begin(), block_shape.end()) { + MACE_CHECK( + block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1, + "Block's shape should be 1D, and greater than 1"); + MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D"); + } + + std::vector paddings_; + std::vector block_shape_; + + protected: + void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor, + const DataFormat data_format, + index_t *output_shape) { + MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); + index_t batch = input_tensor->dim(0); + index_t channels = 0; + index_t height = 0; + index_t width = 0; + if (data_format == DataFormat::NHWC) { + height = input_tensor->dim(1); + width = input_tensor->dim(2); + channels = input_tensor->dim(3); + } else if (data_format == DataFormat::NCHW) { + height = input_tensor->dim(2); + width = input_tensor->dim(3); + channels = input_tensor->dim(1); + } else { + MACE_NOT_IMPLEMENTED; + } + + index_t new_batch = batch / block_shape_[0] / block_shape_[1]; + index_t new_height = height * block_shape_[0] - paddings_[0] - paddings_[1]; + index_t new_width = width * block_shape_[1] - paddings_[2] - paddings_[3]; + + if (data_format == DataFormat::NHWC) { + output_shape[0] = new_batch; + output_shape[1] = new_height; + output_shape[2] = new_width; + output_shape[3] = channels; + } else { + output_shape[0] = new_batch; + output_shape[1] = channels; + output_shape[2] = new_height; + output_shape[3] = new_width; + } + } +}; + +template +struct BatchToSpaceFunctor; + +template<> +struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { + BatchToSpaceFunctor(OpKernelContext *context, + const std::vector &paddings, + const std::vector &block_shape) + : BatchToSpaceFunctorBase(context, paddings, block_shape) {} + + MaceStatus operator()(Tensor *space_tensor, + Tensor *batch_tensor, + StatsFuture *future) { + MACE_UNUSED(future); + + std::vector output_shape(4, 0); + CalculateBatchToSpaceOutputShape(batch_tensor, + DataFormat::NCHW, + output_shape.data()); + MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape)); + + Tensor::MappingGuard input_guard(space_tensor); + Tensor::MappingGuard output_guard(batch_tensor); + + int pad_top = paddings_[0]; + int pad_left = paddings_[2]; + int block_shape_h = block_shape_[0]; + int block_shape_w = block_shape_[1]; + + const float *input_data = batch_tensor->data(); + float *output_data = space_tensor->mutable_data(); + + index_t in_batches = batch_tensor->dim(0); + index_t in_height = batch_tensor->dim(2); + index_t in_width = batch_tensor->dim(3); + + index_t out_batches = space_tensor->dim(0); + index_t channels = space_tensor->dim(1); + index_t out_height = space_tensor->dim(2); + index_t out_width = space_tensor->dim(3); + + // 32k/sizeof(float)/out_width/block_shape + index_t + block_h_size = + std::max(static_cast(1), 8 * 1024 / block_shape_w / out_width); + + // make channel outter loop so we can make best use of cache +#pragma omp parallel for collapse(3) + for (index_t c = 0; c < channels; ++c) { + for (index_t block_h = 0; block_h < in_height; + block_h += block_h_size) { + for (index_t in_b = 0; in_b < in_batches; ++in_b) { + const index_t b = in_b % out_batches; + const index_t tile_index = in_b / out_batches; + const index_t tile_h = tile_index / block_shape_w; + const index_t tile_w = tile_index % block_shape_w; + const index_t valid_h_start = std::max(block_h, + (pad_top - tile_h + + block_shape_h - 1) + / block_shape_h); + const index_t valid_h_end = std::min(in_height, + std::min( + block_h + block_h_size, + (out_height + pad_top + - tile_h + + block_shape_h - 1) + / block_shape_h)); + const index_t valid_w_start = std::max(static_cast(0), + (pad_left - tile_w + + block_shape_w - 1) + / block_shape_w); + const index_t valid_w_end = std::min(in_width, + (out_width + pad_left - tile_w + + block_shape_w - 1) + / block_shape_w); + const float *input_base = + input_data + (in_b * channels + c) * in_height * in_width; + float *output_base = + output_data + (b * channels + c) * out_height * out_width; + + index_t h = valid_h_start * block_shape_h + tile_h - pad_top; + for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) { + index_t w = valid_w_start * block_shape_w + tile_w - pad_left; + for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) { + output_base[h * out_width + w] = + input_base[in_h * in_width + in_w]; + w += block_shape_w; + } // w + h += block_shape_h; + } // h + } // b + } // block_h + } // c + + return MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { + BatchToSpaceFunctor(OpKernelContext *context, + const std::vector &paddings, + const std::vector &block_shape) + : BatchToSpaceFunctorBase(context, paddings, block_shape) {} + + MaceStatus operator()(Tensor *space_tensor, + Tensor *batch_tensor, + StatsFuture *future); + + cl::Kernel kernel_; + uint32_t kwg_size_; + std::unique_ptr kernel_error_; + std::vector space_shape_; +}; +#endif // MACE_ENABLE_OPENCL + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 7c4a7456a122b7028360ab560117ed7bce0e9a0a..601da6cc81f35daf664abcd93cc9eb1c643d8416 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -32,9 +32,8 @@ namespace kernels { template struct DepthToSpaceOpFunctor : OpKernel { DepthToSpaceOpFunctor(OpKernelContext *context, - const int block_size, - bool d2s) - : OpKernel(context), block_size_(block_size), d2s_(d2s) {} + const int block_size) + : OpKernel(context), block_size_(block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -44,17 +43,13 @@ struct DepthToSpaceOpFunctor : OpKernel { const index_t input_height = input->dim(2); const index_t input_width = input->dim(3); - index_t output_depth, output_width, output_height; + MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, + "input depth should be dividable by block_size * block_size", + input_depth); - if (d2s_) { - output_depth = input_depth / (block_size_ * block_size_); - output_width = input_width * block_size_; - output_height = input_height * block_size_; - } else { - output_depth = input_depth * block_size_ * block_size_; - output_width = input_width / block_size_; - output_height = input_height / block_size_; - } + const index_t output_depth = input_depth / (block_size_ * block_size_); + const index_t output_width = input_width * block_size_; + const index_t output_height = input_height * block_size_; std::vector output_shape = {batch_size, output_depth, output_height, output_width}; @@ -65,78 +60,49 @@ struct DepthToSpaceOpFunctor : OpKernel { const T *input_ptr = input->data(); T *output_ptr = output->mutable_data(); - if (d2s_) { -#pragma omp parallel for - for (index_t b = 0; b < batch_size; ++b) { - for (index_t d = 0; d < output_depth; ++d) { - for (index_t h = 0; h < output_height; ++h) { - const index_t in_h = h / block_size_; - const index_t offset_h = (h % block_size_); - for (int w = 0; w < output_width; ++w) { - const index_t in_w = w / block_size_; - const index_t offset_w = w % block_size_; - const index_t offset_d = - (offset_h * block_size_ + offset_w) * output_depth; - - const index_t in_d = d + offset_d; - const index_t o_index = - ((b * output_depth + d) * output_height + h) * output_width - + w; - const index_t i_index = - ((b * input_depth + in_d) * input_height + in_h) * input_width - + in_w; - output_ptr[o_index] = input_ptr[i_index]; - } - } - } - } - } else { #pragma omp parallel for - for (index_t b = 0; b < batch_size; ++b) { - for (index_t d = 0; d < input_depth; ++d) { - for (index_t h = 0; h < input_height; ++h) { - const index_t out_h = h / block_size_; - const index_t offset_h = (h % block_size_); - for (index_t w = 0; w < input_width; ++w) { - const index_t out_w = w / block_size_; - const index_t offset_w = (w % block_size_); - const index_t offset_d = - (offset_h * block_size_ + offset_w) * input_depth; - - const index_t out_d = d + offset_d; - const index_t o_index = - ((b * output_depth + out_d) * output_height + out_h) - * output_width + out_w; - const index_t i_index = - ((b * input_depth + d) * input_height + h) * input_width - + w; - output_ptr[o_index] = input_ptr[i_index]; - } + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < output_depth; ++d) { + for (index_t h = 0; h < output_height; ++h) { + const index_t in_h = h / block_size_; + const index_t offset_h = (h % block_size_); + for (int w = 0; w < output_width; ++w) { + const index_t in_w = w / block_size_; + const index_t offset_w = w % block_size_; + const index_t offset_d = + (offset_h * block_size_ + offset_w) * output_depth; + + const index_t in_d = d + offset_d; + const index_t o_index = + ((b * output_depth + d) * output_height + h) * output_width + + w; + const index_t i_index = + ((b * input_depth + in_d) * input_height + in_h) * input_width + + in_w; + output_ptr[o_index] = input_ptr[i_index]; } } } } + return MACE_SUCCESS; } const int block_size_; - bool d2s_; }; #ifdef MACE_ENABLE_OPENCL template struct DepthToSpaceOpFunctor : OpKernel { DepthToSpaceOpFunctor(OpKernelContext *context, - const int block_size, - bool d2s) - : OpKernel(context), block_size_(block_size), d2s_(d2s) {} + const int block_size) + : OpKernel(context), block_size_(block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); const int block_size_; - bool d2s_; cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; diff --git a/mace/kernels/opencl/batch_to_space.cc b/mace/kernels/opencl/batch_to_space.cc new file mode 100644 index 0000000000000000000000000000000000000000..ec5cf5f7a2da1ff0d7ced9df1e75eaa7bda58707 --- /dev/null +++ b/mace/kernels/opencl/batch_to_space.cc @@ -0,0 +1,104 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ +#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ + +#include "mace/kernels/batch_to_space.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +template +MaceStatus BatchToSpaceFunctor::operator()( + Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { + std::vector output_shape(4, 0); + CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, + output_shape.data()); + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + space_tensor->ResizeImage(output_shape, output_image_shape)); + + const uint32_t chan_blk = + static_cast(RoundUpDiv4(batch_tensor->dim(3))); + + const uint32_t gws[3] = { + chan_blk, static_cast(batch_tensor->dim(2)), + static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; + + auto runtime = context_->device()->opencl_runtime(); + + if (kernel_.get() == nullptr) { + const char *kernel_name = "batch_to_space"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_, context_); + NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToCLCMDDt(DataTypeToEnum::value)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space", + obfuscated_kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + if (!IsVecEqual(space_shape_, space_tensor->shape())) { + uint32_t idx = 0; + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); + kernel_.setArg(idx++, *(batch_tensor->opencl_image())); + kernel_.setArg(idx++, *(space_tensor->opencl_image())); + kernel_.setArg(idx++, block_shape_[0]); + kernel_.setArg(idx++, block_shape_[1]); + kernel_.setArg(idx++, paddings_[0]); + kernel_.setArg(idx++, paddings_[2]); + kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); + + space_shape_ = space_tensor->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + OUT_OF_RANGE_VALIDATION(kernel_error_); + return MACE_SUCCESS; +} + +template struct BatchToSpaceFunctor; +template struct BatchToSpaceFunctor; + +} // namespace kernels +} // namespace mace +#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/opencl/cl/batch_to_space.cl b/mace/kernels/opencl/cl/batch_to_space.cl new file mode 100644 index 0000000000000000000000000000000000000000..7ed0abb8f15b3ad4d636a075f6e8aeb254e56ead --- /dev/null +++ b/mace/kernels/opencl/cl/batch_to_space.cl @@ -0,0 +1,48 @@ +#include + +__kernel void batch_to_space(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t batch_data, + __write_only image2d_t space_data, + __private const int block_height, + __private const int block_width, + __private const int padding_height, + __private const int padding_width, + __private const int batch_size, + __private const int space_height, + __private const int space_width, + __private const int batch_height, + __private const int batch_width) { + const int chan_idx = get_global_id(0); + const int batch_w_idx = get_global_id(1); + const int batch_hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 + || batch_hb_idx >= global_size_dim2) { + return; + } +#endif + + const int batch_b_idx = batch_hb_idx / batch_height; + const int batch_h_idx = batch_hb_idx % batch_height; + + const int block_size = mul24(block_height, block_width); + const int space_b_idx = batch_b_idx % batch_size; + const int remaining_batch_idx = batch_b_idx / batch_size; + const int space_h_idx = (remaining_batch_idx / block_width) + + mul24(batch_h_idx, block_height) - padding_height; + const int space_w_idx = (remaining_batch_idx % block_width) + + mul24(batch_w_idx, block_width) - padding_width; + + if (0 <= space_w_idx && space_w_idx < space_width && + 0 <= space_h_idx && space_h_idx < space_height) { + int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx); + DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord); + + int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx, + space_b_idx * space_height + space_h_idx); + + WRITE_IMAGET(space_data, space_coord, value); + } +} diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 3fd66f5a35fcec5b1563ecc283552a9e398fc754..7267c61e1c7f9a3a88f04c5a798682ffb5325876 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -39,43 +39,3 @@ __kernel void depth_to_space(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data); } - -__kernel void space_to_depth(KERNEL_ERROR_PARAMS - GLOBAL_WORK_GROUP_SIZE_DIM3 - __read_only image2d_t input, - __private const int block_size, - __private const int input_width, - __private const int input_depth_blocks, - __private const int output_hb, - __private const int output_width, - __private const int output_depth_blocks, - __write_only image2d_t output) { - const int d = get_global_id(0); - const int w = get_global_id(1); - const int hb = get_global_id(2); - -#ifndef NON_UNIFORM_WORK_GROUP - if (d >= global_size_dim0 || w >= global_size_dim1 - || hb >= global_size_dim2) { - return; - } -#endif - - const int in_pos = mad24(d, input_width, w); - - const int out_hb = hb / block_size; - const int offset_h = hb % block_size; - const int out_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks; - const int out_d = d + offset_d; - - if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) { - return; - } - - const int out_pos = mad24(out_d, output_width, out_w); - DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb)); - - WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data); -} diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index c604dacc1dbd42db5fc9c86be174bd30af040563..d6f325d8e8f7277bd11fecbc8d4a167b6ebe369a 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -49,50 +49,3 @@ __kernel void space_to_batch(KERNEL_ERROR_PARAMS WRITE_IMAGET(batch_data, batch_coord, value); } - -__kernel void batch_to_space(KERNEL_ERROR_PARAMS - GLOBAL_WORK_GROUP_SIZE_DIM3 - __read_only image2d_t batch_data, - __write_only image2d_t space_data, - __private const int block_height, - __private const int block_width, - __private const int padding_height, - __private const int padding_width, - __private const int batch_size, - __private const int space_height, - __private const int space_width, - __private const int batch_height, - __private const int batch_width) { - const int chan_idx = get_global_id(0); - const int batch_w_idx = get_global_id(1); - const int batch_hb_idx = get_global_id(2); - -#ifndef NON_UNIFORM_WORK_GROUP - if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 - || batch_hb_idx >= global_size_dim2) { - return; - } -#endif - - const int batch_b_idx = batch_hb_idx / batch_height; - const int batch_h_idx = batch_hb_idx % batch_height; - - const int block_size = mul24(block_height, block_width); - const int space_b_idx = batch_b_idx % batch_size; - const int remaining_batch_idx = batch_b_idx / batch_size; - const int space_h_idx = (remaining_batch_idx / block_width) + - mul24(batch_h_idx, block_height) - padding_height; - const int space_w_idx = (remaining_batch_idx % block_width) + - mul24(batch_w_idx, block_width) - padding_width; - - if (0 <= space_w_idx && space_w_idx < space_width && - 0 <= space_h_idx && space_h_idx < space_height) { - int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx); - DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord); - - int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx, - space_b_idx * space_height + space_h_idx); - - WRITE_IMAGET(space_data, space_coord, value); - } -} diff --git a/mace/kernels/opencl/cl/space_to_depth.cl b/mace/kernels/opencl/cl/space_to_depth.cl new file mode 100644 index 0000000000000000000000000000000000000000..604d665128288c7593a8106d652a609362e8a882 --- /dev/null +++ b/mace/kernels/opencl/cl/space_to_depth.cl @@ -0,0 +1,41 @@ +#include + +__kernel void space_to_depth(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __private const int block_size, + __private const int input_width, + __private const int input_depth_blocks, + __private const int output_hb, + __private const int output_width, + __private const int output_depth_blocks, + __write_only image2d_t output) { + const int d = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (d >= global_size_dim0 || w >= global_size_dim1 + || hb >= global_size_dim2) { + return; + } +#endif + + const int in_pos = mad24(d, input_width, w); + + const int out_hb = hb / block_size; + const int offset_h = hb % block_size; + const int out_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks; + const int out_d = d + offset_d; + + if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) { + return; + } + + const int out_pos = mad24(out_d, output_width, out_w); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb)); + + WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data); +} diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index f5427af18d5b37887bb2991f0f51b2731c6e7eff..d6d8ba82c1a72c854f8548bf38aa7b324f938541 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -30,54 +30,41 @@ MaceStatus DepthToSpaceOpFunctor::operator()( const index_t input_width = input->dim(2); const index_t input_depth = input->dim(3); - const char *kernel_name = nullptr; + MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, + "input depth should be dividable by block_size * block_size", + input_depth); + MACE_CHECK((input_depth % 4) == 0, + "input channel should be dividable by 4"); + + const index_t output_height = input_height * block_size_; + const index_t output_width = input_width * block_size_; + const index_t output_depth = input_depth / (block_size_ * block_size_); + MACE_CHECK(output_depth % 4 == 0, "output channel not support:") + << output_depth; - uint32_t gws[3]; - std::string tuning_key; - index_t output_height, output_width, output_depth; - if (d2s_) { - output_height = input_height * block_size_; - output_width = input_width * block_size_; - output_depth = input_depth / (block_size_ * block_size_); - MACE_CHECK(output_depth % 4 == 0, "output channel not support:") - << output_depth; - kernel_name = "depth_to_space"; - - gws[0] = static_cast(RoundUpDiv4(output_depth)); - gws[1] = static_cast(output_width); - gws[2] = static_cast(output_height * batch); - tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height, - output_width, output_depth); - } else { - output_height = input_height / block_size_; - output_width = input_width / block_size_; - output_depth = input_depth * block_size_ * block_size_; - MACE_CHECK(input_depth % 4 == 0, "input channel not support:") - << input_depth; - kernel_name = "space_to_depth"; - - gws[0] = static_cast(RoundUpDiv4(input_depth)); - gws[1] = static_cast(input_width); - gws[2] = static_cast(input_height * batch); - tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), - input->dim(1), input->dim(2), input->dim(3)); - } const index_t input_depth_blocks = RoundUpDiv4(input_depth); const index_t output_depth_blocks = RoundUpDiv4(output_depth); - std::vector output_shape = {batch, output_height, output_width, + std::vector output_shape = {batch, + output_height, + output_width, output_depth}; - std::vector image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output_depth)), + static_cast(output_width), + static_cast(output_height * batch) + }; auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; + const char *kernel_name = kernel_name = "depth_to_space"; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; @@ -89,7 +76,6 @@ MaceStatus DepthToSpaceOpFunctor::operator()( obfuscated_kernel_name, built_options, &kernel_)); - kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } @@ -99,26 +85,20 @@ MaceStatus DepthToSpaceOpFunctor::operator()( OUT_OF_RANGE_SET_ARG; SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); - if (d2s_) { - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(input_height * batch)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth_blocks)); - kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth_blocks)); - } else { - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth_blocks)); - kernel_.setArg(idx++, static_cast(output_height * batch)); - kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth_blocks)); - } + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(input_height * batch)); + kernel_.setArg(idx++, static_cast(input_width)); + kernel_.setArg(idx++, static_cast(input_depth_blocks)); + kernel_.setArg(idx++, static_cast(output_width)); + kernel_.setArg(idx++, static_cast(output_depth_blocks)); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); } + std::string tuning_key = Concat("depth_to_space_opencl_kernel", + batch, output_height, + output_width, output_depth); const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index 8794dd2a5ee2cefffaa8cec5b591501b3980c2a8..d015a99e3fc4a83c3f679854b59478d9d8668a79 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -28,27 +28,14 @@ template MaceStatus SpaceToBatchFunctor::operator()( Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { std::vector output_shape(4, 0); - if (b2s_) { - CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, - output_shape.data()); - } else { - CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC, - output_shape.data()); - } - - const char *kernel_name = nullptr; + CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC, + output_shape.data()); std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &output_image_shape); - if (b2s_) { - MACE_RETURN_IF_ERROR( - space_tensor->ResizeImage(output_shape, output_image_shape)); - kernel_name = "batch_to_space"; - } else { - MACE_RETURN_IF_ERROR( - batch_tensor->ResizeImage(output_shape, output_image_shape)); - kernel_name = "space_to_batch"; - } + MACE_RETURN_IF_ERROR( + batch_tensor->ResizeImage(output_shape, output_image_shape)); + const char *kernel_name = "space_to_batch"; const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); const uint32_t gws[3] = { chan_blk, static_cast(batch_tensor->dim(2)), @@ -79,13 +66,9 @@ MaceStatus SpaceToBatchFunctor::operator()( uint32_t idx = 0; OUT_OF_RANGE_SET_ARG; SET_3D_GWS_ARGS(kernel_); - if (b2s_) { - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - } else { - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - } + + kernel_.setArg(idx++, *(space_tensor->opencl_image())); + kernel_.setArg(idx++, *(batch_tensor->opencl_image())); kernel_.setArg(idx++, block_shape_[0]); kernel_.setArg(idx++, block_shape_[1]); kernel_.setArg(idx++, paddings_[0]); diff --git a/mace/kernels/opencl/space_to_depth.cc b/mace/kernels/opencl/space_to_depth.cc new file mode 100644 index 0000000000000000000000000000000000000000..1135c79cdadae53a8c151673fbbd8567d25dd29a --- /dev/null +++ b/mace/kernels/opencl/space_to_depth.cc @@ -0,0 +1,107 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/space_to_depth.h" +#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +template +MaceStatus SpaceToDepthOpFunctor::operator()( + const Tensor *input, Tensor *output, StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + const index_t input_depth = input->dim(3); + + MACE_CHECK((input_depth % 4) == 0, + "input channel should be dividable by 4"); + MACE_CHECK( + (input_width % block_size_ == 0) && (input_height % block_size_ == 0), + "input width and height should be dividable by block_size"); + + const index_t output_height = input_height / block_size_; + const index_t output_width = input_width / block_size_; + const index_t output_depth = input_depth * block_size_ * block_size_; + + const index_t input_depth_blocks = RoundUpDiv4(input_depth); + const index_t output_depth_blocks = RoundUpDiv4(output_depth); + + std::vector output_shape = {batch, output_height, output_width, + output_depth}; + + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + auto runtime = context_->device()->opencl_runtime(); + if (kernel_.get() == nullptr) { + std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_, context_); + NON_UNIFORM_WG_CONFIG; + const char *kernel_name = "space_to_depth"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth", + obfuscated_kernel_name, + built_options, + &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(input_depth_blocks), + static_cast(input_width), + static_cast(input_height * batch)}; + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(input_width)); + kernel_.setArg(idx++, static_cast(input_depth_blocks)); + kernel_.setArg(idx++, static_cast(output_height * batch)); + kernel_.setArg(idx++, static_cast(output_width)); + kernel_.setArg(idx++, static_cast(output_depth_blocks)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), + input->dim(1), input->dim(2), input->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); + + OUT_OF_RANGE_VALIDATION(kernel_error_); + return MACE_SUCCESS; +} + +template struct SpaceToDepthOpFunctor; +template struct SpaceToDepthOpFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 7670632a2620c1d2552097127251ee5d850047d9..94fdea951a8cd84fd05b5e8712add3bea334ea13 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -33,12 +33,10 @@ namespace kernels { struct SpaceToBatchFunctorBase : OpKernel { SpaceToBatchFunctorBase(OpKernelContext *context, const std::vector &paddings, - const std::vector &block_shape, - bool b2s) + const std::vector &block_shape) : OpKernel(context), paddings_(paddings.begin(), paddings.end()), - block_shape_(block_shape.begin(), block_shape.end()), - b2s_(b2s) { + block_shape_(block_shape.begin(), block_shape.end()) { MACE_CHECK( block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1, "Block's shape should be 1D, and greater than 1"); @@ -47,7 +45,6 @@ struct SpaceToBatchFunctorBase : OpKernel { std::vector paddings_; std::vector block_shape_; - bool b2s_; protected: void CalculateSpaceToBatchOutputShape(const Tensor *input_tensor, @@ -93,43 +90,6 @@ struct SpaceToBatchFunctorBase : OpKernel { output_shape[3] = new_width; } } - - void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor, - const DataFormat data_format, - index_t *output_shape) { - MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); - index_t batch = input_tensor->dim(0); - index_t channels = 0; - index_t height = 0; - index_t width = 0; - if (data_format == DataFormat::NHWC) { - height = input_tensor->dim(1); - width = input_tensor->dim(2); - channels = input_tensor->dim(3); - } else if (data_format == DataFormat::NCHW) { - height = input_tensor->dim(2); - width = input_tensor->dim(3); - channels = input_tensor->dim(1); - } else { - MACE_NOT_IMPLEMENTED; - } - - index_t new_batch = batch / block_shape_[0] / block_shape_[1]; - index_t new_height = height * block_shape_[0] - paddings_[0] - paddings_[1]; - index_t new_width = width * block_shape_[1] - paddings_[2] - paddings_[3]; - - if (data_format == DataFormat::NHWC) { - output_shape[0] = new_batch; - output_shape[1] = new_height; - output_shape[2] = new_width; - output_shape[3] = channels; - } else { - output_shape[0] = new_batch; - output_shape[1] = channels; - output_shape[2] = new_height; - output_shape[3] = new_width; - } - } }; template @@ -139,9 +99,8 @@ template<> struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { SpaceToBatchFunctor(OpKernelContext *context, const std::vector &paddings, - const std::vector &block_shape, - bool b2s) - : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} + const std::vector &block_shape) + : SpaceToBatchFunctorBase(context, paddings, block_shape) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, @@ -149,17 +108,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { MACE_UNUSED(future); std::vector output_shape(4, 0); - if (b2s_) { - CalculateBatchToSpaceOutputShape(batch_tensor, - DataFormat::NCHW, - output_shape.data()); - MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape)); - } else { - CalculateSpaceToBatchOutputShape(space_tensor, - DataFormat::NCHW, - output_shape.data()); - MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape)); - } + + CalculateSpaceToBatchOutputShape(space_tensor, + DataFormat::NCHW, + output_shape.data()); + MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape)); Tensor::MappingGuard input_guard(space_tensor); Tensor::MappingGuard output_guard(batch_tensor); @@ -169,152 +122,85 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { int block_shape_h = block_shape_[0]; int block_shape_w = block_shape_[1]; - if (b2s_) { - const float *input_data = batch_tensor->data(); - float *output_data = space_tensor->mutable_data(); + const float *input_data = space_tensor->data(); + float *output_data = batch_tensor->mutable_data(); - index_t in_batches = batch_tensor->dim(0); - index_t in_height = batch_tensor->dim(2); - index_t in_width = batch_tensor->dim(3); + index_t in_batches = space_tensor->dim(0); + index_t in_height = space_tensor->dim(2); + index_t in_width = space_tensor->dim(3); - index_t out_batches = space_tensor->dim(0); - index_t channels = space_tensor->dim(1); - index_t out_height = space_tensor->dim(2); - index_t out_width = space_tensor->dim(3); + index_t out_batches = batch_tensor->dim(0); + index_t channels = batch_tensor->dim(1); + index_t out_height = batch_tensor->dim(2); + index_t out_width = batch_tensor->dim(3); - // 32k/sizeof(float)/out_width/block_shape - index_t - block_h_size = - std::max(static_cast(1), 8 * 1024 / block_shape_w / out_width); + index_t block_h_size = + std::max(static_cast(1), 8 * 1024 / block_shape_w / in_width); - // make channel outter loop so we can make best use of cache + // make channel outter loop so we can make best use of cache #pragma omp parallel for collapse(3) - for (index_t c = 0; c < channels; ++c) { - for (index_t block_h = 0; block_h < in_height; - block_h += block_h_size) { - for (index_t in_b = 0; in_b < in_batches; ++in_b) { - const index_t b = in_b % out_batches; - const index_t tile_index = in_b / out_batches; - const index_t tile_h = tile_index / block_shape_w; - const index_t tile_w = tile_index % block_shape_w; - const index_t valid_h_start = std::max(block_h, - (pad_top - tile_h - + block_shape_h - 1) - / block_shape_h); - const index_t valid_h_end = std::min(in_height, - std::min( - block_h + block_h_size, - (out_height + pad_top - - tile_h - + block_shape_h - 1) - / block_shape_h)); - const index_t valid_w_start = std::max(static_cast(0), - (pad_left - tile_w - + block_shape_w - 1) - / block_shape_w); - const index_t valid_w_end = std::min(in_width, - (out_width + pad_left - tile_w + for (index_t c = 0; c < channels; ++c) { + for (index_t block_h = 0; block_h < out_height; + block_h += block_h_size) { + for (index_t b = 0; b < out_batches; ++b) { + const index_t in_b = b % in_batches; + const index_t tile_index = b / in_batches; + const index_t tile_h = tile_index / block_shape_w; + const index_t tile_w = tile_index % block_shape_w; + const index_t valid_h_start = std::max(block_h, + (pad_top - tile_h + + block_shape_h - 1) + / block_shape_h); + const index_t valid_h_end = std::min(out_height, + std::min( + block_h + block_h_size, + (in_height + pad_top + - tile_h + + block_shape_h - 1) + / block_shape_h)); + const index_t valid_w_start = std::max(static_cast(0), + (pad_left - tile_w + block_shape_w - 1) / block_shape_w); - const float *input_base = - input_data + (in_b * channels + c) * in_height * in_width; - float *output_base = - output_data + (b * channels + c) * out_height * out_width; - - index_t h = valid_h_start * block_shape_h + tile_h - pad_top; - for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) { - index_t w = valid_w_start * block_shape_w + tile_w - pad_left; - for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) { - output_base[h * out_width + w] = - input_base[in_h * in_width + in_w]; - w += block_shape_w; - } // w - h += block_shape_h; - } // h - } // b - } // block_h - } // c - } else { - const float *input_data = space_tensor->data(); - float *output_data = batch_tensor->mutable_data(); - - index_t in_batches = space_tensor->dim(0); - index_t in_height = space_tensor->dim(2); - index_t in_width = space_tensor->dim(3); - - index_t out_batches = batch_tensor->dim(0); - index_t channels = batch_tensor->dim(1); - index_t out_height = batch_tensor->dim(2); - index_t out_width = batch_tensor->dim(3); + const index_t valid_w_end = std::min(out_width, + (in_width + pad_left - tile_w + + block_shape_w - 1) + / block_shape_w); + const float *input_base = + input_data + (in_b * channels + c) * in_height * in_width; + float *output_base = + output_data + (b * channels + c) * out_height * out_width; + + memset(output_base + block_h * out_width, + 0, + (valid_h_start - block_h) * out_width * sizeof(float)); + + index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top; + for (index_t h = valid_h_start; h < valid_h_end; ++h) { + memset(output_base + h * out_width, + 0, + valid_w_start * sizeof(float)); - index_t block_h_size = - std::max(static_cast(1), 8 * 1024 / block_shape_w / in_width); + index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left; + for (index_t w = valid_w_start; w < valid_w_end; ++w) { + output_base[h * out_width + w] = + input_base[in_h * in_width + in_w]; + in_w += block_shape_w; + } // w + in_h += block_shape_h; - // make channel outter loop so we can make best use of cache -#pragma omp parallel for collapse(3) - for (index_t c = 0; c < channels; ++c) { - for (index_t block_h = 0; block_h < out_height; - block_h += block_h_size) { - for (index_t b = 0; b < out_batches; ++b) { - const index_t in_b = b % in_batches; - const index_t tile_index = b / in_batches; - const index_t tile_h = tile_index / block_shape_w; - const index_t tile_w = tile_index % block_shape_w; - const index_t valid_h_start = std::max(block_h, - (pad_top - tile_h - + block_shape_h - 1) - / block_shape_h); - const index_t valid_h_end = std::min(out_height, - std::min( - block_h + block_h_size, - (in_height + pad_top - - tile_h - + block_shape_h - 1) - / block_shape_h)); - const index_t valid_w_start = std::max(static_cast(0), - (pad_left - tile_w - + block_shape_w - 1) - / block_shape_w); - const index_t valid_w_end = std::min(out_width, - (in_width + pad_left - tile_w - + block_shape_w - 1) - / block_shape_w); - const float *input_base = - input_data + (in_b * channels + c) * in_height * in_width; - float *output_base = - output_data + (b * channels + c) * out_height * out_width; - - memset(output_base + block_h * out_width, + memset(output_base + h * out_width + valid_w_end, 0, - (valid_h_start - block_h) * out_width * sizeof(float)); - - index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top; - for (index_t h = valid_h_start; h < valid_h_end; ++h) { - memset(output_base + h * out_width, - 0, - valid_w_start * sizeof(float)); - - index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left; - for (index_t w = valid_w_start; w < valid_w_end; ++w) { - output_base[h * out_width + w] = - input_base[in_h * in_width + in_w]; - in_w += block_shape_w; - } // w - in_h += block_shape_h; - - memset(output_base + h * out_width + valid_w_end, - 0, - (out_width - valid_w_end) * sizeof(float)); - } // h - - memset(output_base + valid_h_end * out_width, - 0, - (std::min(out_height, block_h + block_h_size) - valid_h_end) - * out_width * sizeof(float)); - } // b - } // block_h - } // c - } + (out_width - valid_w_end) * sizeof(float)); + } // h + + memset(output_base + valid_h_end * out_width, + 0, + (std::min(out_height, block_h + block_h_size) - valid_h_end) + * out_width * sizeof(float)); + } // b + } // block_h + } // c return MACE_SUCCESS; } }; @@ -324,9 +210,8 @@ template struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { SpaceToBatchFunctor(OpKernelContext *context, const std::vector &paddings, - const std::vector &block_shape, - bool b2s) - : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} + const std::vector &block_shape) + : SpaceToBatchFunctorBase(context, paddings, block_shape) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, diff --git a/mace/kernels/space_to_depth.h b/mace/kernels/space_to_depth.h new file mode 100644 index 0000000000000000000000000000000000000000..48dc60781d92a2f58df42bebe30bbf8059887517 --- /dev/null +++ b/mace/kernels/space_to_depth.h @@ -0,0 +1,114 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_SPACE_TO_DEPTH_H_ +#define MACE_KERNELS_SPACE_TO_DEPTH_H_ +#include +#include + +#include "mace/core/future.h" +#include "mace/core/tensor.h" +#include "mace/public/mace.h" +#include "mace/kernels/kernel.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/cl2_header.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +template +struct SpaceToDepthOpFunctor : OpKernel { + SpaceToDepthOpFunctor(OpKernelContext *context, + const int block_size) + : OpKernel(context), block_size_(block_size) {} + MaceStatus operator()(const Tensor *input, + Tensor *output, + StatsFuture *future) { + MACE_UNUSED(future); + const index_t batch_size = input->dim(0); + const index_t input_depth = input->dim(1); + const index_t input_height = input->dim(2); + const index_t input_width = input->dim(3); + + MACE_CHECK( + (input_width % block_size_ == 0) && (input_height % block_size_ == 0), + "input width and height should be dividable by block_size"); + + const index_t output_depth = input_depth * block_size_ * block_size_; + const index_t output_width = input_width / block_size_; + const index_t output_height = input_height / block_size_; + std::vector output_shape = {batch_size, output_depth, + output_height, output_width}; + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard logits_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_ptr = input->data(); + T *output_ptr = output->mutable_data(); + +#pragma omp parallel for + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < input_depth; ++d) { + for (index_t h = 0; h < input_height; ++h) { + const index_t out_h = h / block_size_; + const index_t offset_h = (h % block_size_); + for (index_t w = 0; w < input_width; ++w) { + const index_t out_w = w / block_size_; + const index_t offset_w = (w % block_size_); + const index_t offset_d = + (offset_h * block_size_ + offset_w) * input_depth; + + const index_t out_d = d + offset_d; + const index_t o_index = + ((b * output_depth + out_d) * output_height + out_h) + * output_width + out_w; + const index_t i_index = + ((b * input_depth + d) * input_height + h) * input_width + w; + output_ptr[o_index] = input_ptr[i_index]; + } + } + } + } + + return MACE_SUCCESS; + } + + const int block_size_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +struct SpaceToDepthOpFunctor : OpKernel { + explicit SpaceToDepthOpFunctor(OpKernelContext *context, + const int block_size) + : OpKernel(context), block_size_(block_size) {} + MaceStatus operator()(const Tensor *input, + Tensor *output, + StatsFuture *future); + + const int block_size_; + cl::Kernel kernel_; + uint32_t kwg_size_; + std::unique_ptr kernel_error_; + std::vector input_shape_; +}; +#endif // MACE_ENABLE_OPENCL + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_SPACE_TO_DEPTH_H_ diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h index fa1ed2c6a2534b62795cd3d2c541f722795ff9de..3f56e8d94cb1b447ae5ce0203255e3a59813a34b 100644 --- a/mace/ops/batch_to_space.h +++ b/mace/ops/batch_to_space.h @@ -19,7 +19,7 @@ #include #include "mace/core/operator.h" -#include "mace/kernels/space_to_batch.h" +#include "mace/kernels/batch_to_space.h" namespace mace { namespace ops { @@ -31,8 +31,7 @@ class BatchToSpaceNDOp : public Operator { : Operator(op_def, context), functor_(context, OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), - OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), - true) {} + OperatorBase::GetRepeatedArgs("block_shape", {1, 1})) {} MaceStatus Run(StatsFuture *future) override { const Tensor *batch_tensor = this->Input(INPUT); @@ -41,7 +40,7 @@ class BatchToSpaceNDOp : public Operator { } private: - kernels::SpaceToBatchFunctor functor_; + kernels::BatchToSpaceFunctor functor_; protected: MACE_OP_INPUT_TAGS(INPUT); diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h index 49183873733cd4d878ad1113f64c76aa918744cd..c2946b849a72ea1215e49c4875887c4cf0b49b0d 100644 --- a/mace/ops/depth_to_space.h +++ b/mace/ops/depth_to_space.h @@ -30,26 +30,13 @@ class DepthToSpaceOp : public Operator { DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context) : Operator(op_def, context), block_size_(OperatorBase::GetOptionalArg("block_size", 1)), - functor_(context, this->block_size_, true) {} + functor_(context, this->block_size_) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); - int input_depth; - if (D == CPU) { - input_depth = input->dim(1); - } else if (D == GPU) { - input_depth = input->dim(3); - } else { - MACE_NOT_IMPLEMENTED; - } - MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, - "input depth should be dividable by block_size * block_size", - input_depth); - MACE_CHECK((input_depth % 4) == 0, - "input channel should be dividable by 4"); return functor_(input, output, future); } diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 99c4fb0b6e4bf05a4e2c502731136966cabdd07e..768f7c1a0e8e98a698d6b9e256ffe6da8a4978c4 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -24,21 +24,18 @@ namespace test { namespace { template -void RunDepthToSpace(const bool d2s, - const std::vector &input_shape, +void RunDepthToSpace(const std::vector &input_shape, const std::vector &input_data, const int block_size, const std::vector &expected_shape, const std::vector &expected_data) { OpsTestNet net; net.AddInputFromArray("Input", input_shape, input_data); - const char *ops_name = (d2s) ? "DepthToSpace" : "SpaceToDepth"; - const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest"; // Construct graph if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder(ops_name, ops_test_name) + OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .Output("OutputNCHW") .AddIntArg("block_size", block_size) @@ -51,7 +48,7 @@ void RunDepthToSpace(const bool d2s, } else { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); - OpDefBuilder(ops_name, ops_test_name) + OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputImage") .Output("OutputImage") .AddIntArg("block_size", block_size) @@ -69,47 +66,11 @@ void RunDepthToSpace(const bool d2s, } } // namespace -class SpaceToDepthOpTest : public OpsTestBase {}; - -TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { - RunDepthToSpace( - false, {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, - 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, - 2, {1, 1, 2, 16}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); -} - -TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { - RunDepthToSpace( - false, {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, - 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, - 2, {1, 1, 2, 16}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); -} - -TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { - RunDepthToSpace( - false, {1, 2, 2, 4}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); -} - -TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { - RunDepthToSpace( - false, {1, 2, 2, 4}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); -} - class DepthToSpaceOpTest : public OpsTestBase {}; TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { RunDepthToSpace( - true, {1, 1, 2, 16}, + {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 2, {1, 2, 4, 4}, @@ -119,7 +80,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { RunDepthToSpace( - true, {1, 1, 2, 16}, + {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 2, {1, 2, 4, 4}, @@ -129,14 +90,14 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) { RunDepthToSpace( - true, {1, 1, 1, 16}, + {1, 1, 1, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); } TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { RunDepthToSpace( - true, {1, 1, 1, 16}, + {1, 1, 1, 16}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); } @@ -144,14 +105,13 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) { const std::vector in = std::vector(192 * 192 * 128, 1.0); - RunDepthToSpace(true, {1, 192, 192, 128}, in, 2, + RunDepthToSpace({1, 192, 192, 128}, in, 2, {1, 384, 384, 32}, in); } namespace { template -void RandomTest(const bool d2s, - const int block_size, +void RandomTest(const int block_size, const std::vector &shape) { testing::internal::LogToStderr(); srand(time(NULL)); @@ -159,14 +119,11 @@ void RandomTest(const bool d2s, // Construct graph OpsTestNet net; - const char *ops_name = (d2s) ? "DepthToSpace" : "SpaceToDepth"; - const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest"; - // Add input data net.AddRandomInput("Input", shape); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder(ops_name, ops_test_name) + OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .AddIntArg("block_size", block_size) .Output("OutputNCHW") @@ -181,7 +138,7 @@ void RandomTest(const bool d2s, BufferToImage(&net, "Input", "InputImg", kernels::BufferType::IN_OUT_CHANNEL); - OpDefBuilder(ops_name, ops_test_name) + OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputImg") .AddIntArg("block_size", block_size) .AddIntArg("T", static_cast(DataTypeToEnum::value)) @@ -205,19 +162,11 @@ void RandomTest(const bool d2s, } // namespace TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { - RandomTest(true, 2, {1, 192, 192, 128}); + RandomTest(2, {1, 192, 192, 128}); } TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { - RandomTest(true, 2, {1, 192, 192, 128}); -} - -TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { - RandomTest(false, 2, {1, 384, 384, 32}); -} - -TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { - RandomTest(false, 2, {1, 384, 384, 32}); + RandomTest(2, {1, 192, 192, 128}); } } // namespace test diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h index 170bde09b0876edb370f7873f3f9fa09e55d67ce..39b90b1afbb4fd06c0825e9a4274b9f8f4fa23b7 100644 --- a/mace/ops/space_to_batch.h +++ b/mace/ops/space_to_batch.h @@ -31,8 +31,7 @@ class SpaceToBatchNDOp : public Operator { : Operator(op_def, context), functor_(context, OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), - OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), - false) {} + OperatorBase::GetRepeatedArgs("block_shape", {1, 1})) {} MaceStatus Run(StatsFuture *future) override { const Tensor *space_tensor = this->Input(INPUT); diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h index 75dd27ed04a4a49a85a7e6c8d760bc0a76c1928b..6d078e2ff8ad0f4568470f3a143a4aefae37f16b 100644 --- a/mace/ops/space_to_depth.h +++ b/mace/ops/space_to_depth.h @@ -19,7 +19,7 @@ #include #include "mace/core/operator.h" -#include "mace/kernels/depth_to_space.h" +#include "mace/kernels/space_to_depth.h" namespace mace { namespace ops { @@ -30,34 +30,12 @@ class SpaceToDepthOp : public Operator { SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context) : Operator(op_def, context), functor_(context, - OperatorBase::GetOptionalArg("block_size", 1), - false) {} + OperatorBase::GetOptionalArg("block_size", 1)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); - const int block_size = OperatorBase::GetOptionalArg("block_size", 1); - index_t input_height; - index_t input_width; - index_t input_depth; - if (D == CPU) { - input_height = input->dim(2); - input_width = input->dim(3); - input_depth = input->dim(1); - } else if (D == GPU) { - input_height = input->dim(1); - input_width = input->dim(2); - input_depth = input->dim(3); - } else { - MACE_NOT_IMPLEMENTED; - } - MACE_CHECK((input_depth % 4) == 0, - "input channel should be dividable by 4"); - MACE_CHECK( - (input_width % block_size == 0) && (input_height % block_size == 0), - "input width and height should be dividable by block_size", - input->dim(3)); return functor_(input, output, future); } @@ -66,7 +44,7 @@ class SpaceToDepthOp : public Operator { MACE_OP_OUTPUT_TAGS(OUTPUT); private: - kernels::DepthToSpaceOpFunctor functor_; + kernels::SpaceToDepthOpFunctor functor_; }; } // namespace ops diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c1168a650ffdda94502272dec5b56594bc11dc4f --- /dev/null +++ b/mace/ops/space_to_depth_test.cc @@ -0,0 +1,167 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void RunSpaceToDepth(const std::vector &input_shape, + const std::vector &input_data, + const int block_size, + const std::vector &expected_shape, + const std::vector &expected_data) { + OpsTestNet net; + net.AddInputFromArray("Input", input_shape, input_data); + // Construct graph + if (D == DeviceType::CPU) { + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntArg("block_size", block_size) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + net.TransformDataFormat("OutputNCHW", NCHW, + "Output", NHWC); + + } else { + BufferToImage(&net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("block_size", block_size) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + if (D == DeviceType::GPU) { + ImageToBuffer(&net, "OutputImage", "Output", + kernels::BufferType::IN_OUT_CHANNEL); + } + auto expected = net.CreateTensor(expected_shape, expected_data); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} +} // namespace + +class SpaceToDepthOpTest : public OpsTestBase {}; + +TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { + RunSpaceToDepth( + {1, 2, 4, 4}, + {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, + 2, {1, 1, 2, 16}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); +} + +TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { + RunSpaceToDepth( + {1, 2, 4, 4}, + {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, + 2, {1, 1, 2, 16}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); +} + +TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { + RunSpaceToDepth( + {1, 2, 2, 4}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); +} + +TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { + RunSpaceToDepth( + {1, 2, 2, 4}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); +} + +namespace { +template +void RandomTest(const int block_size, + const std::vector &shape) { + testing::internal::LogToStderr(); + srand(time(NULL)); + + // Construct graph + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", shape); + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") + .Input("InputNCHW") + .AddIntArg("block_size", block_size) + .Output("OutputNCHW") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(); + + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); + + BufferToImage(&net, "Input", "InputImg", + kernels::BufferType::IN_OUT_CHANNEL); + + OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") + .Input("InputImg") + .AddIntArg("block_size", block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Output("OutputImg") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + ImageToBuffer(&net, "OutputImg", "OPENCLOutput", + kernels::BufferType::IN_OUT_CHANNEL); + + if (DataTypeToEnum::value == DT_FLOAT) { + ExpectTensorNear(*net.GetTensor("Output"), + *net.GetOutput("OPENCLOutput"), 1e-5); + } else { + ExpectTensorNear(*net.GetTensor("Output"), + *net.GetOutput("OPENCLOutput"), 1e-3, 1e-4); + } +} +} // namespace + +TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { + RandomTest(2, {1, 384, 384, 32}); +} + +TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { + RandomTest(2, {1, 384, 384, 32}); +} + +} // namespace test +} // namespace ops +} // namespace mace