diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index ab557c91e3101234585c85ba1b01d4b87060134c..76f3c80e2d5ca7b09b4d89a1c4d72b5e17023910 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -108,11 +108,11 @@ struct DepthToSpaceOpFunctor { : block_size_(block_size), d2s_(d2s) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future); + const int block_size_; + bool d2s_; cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; - const int block_size_; - bool d2s_; std::vector input_shape_; }; diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index d2ea128afcf69eee459cb9d2a92f0bee06e85163..bd7e723d08fa41e7691a78e71f849c3267b9fd5f 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -22,16 +22,31 @@ void DepthToSpaceOpFunctor::operator()( const char *kernel_name = nullptr; + uint32_t gws[3]; + std::stringstream ss; index_t output_height, output_width, output_depth; - if (d2s_) { output_height = input_height * block_size_; + if (d2s_) { + output_height = input_height * block_size_; output_width = input_width * block_size_; output_depth = input_depth / (block_size_ * block_size_); kernel_name = "depth_to_space"; + + gws[0] = static_cast(RoundUpDiv4(output_depth)); + gws[1] = static_cast(output_width); + gws[2] = static_cast(output_height * batch); + ss << "depth_to_space_opencl_kernel_" << batch << "_" + << output_height << "_" << output_width << "_" << output_depth; } else { output_height = input_height / block_size_; output_width = input_width / block_size_; output_depth = input_depth * block_size_ * block_size_; kernel_name = "space_to_depth"; + + gws[0] = static_cast(RoundUpDiv4(input_depth)); + gws[1] = static_cast(input_width); + gws[2] = static_cast(input_height * batch); + ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" + << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); } const index_t input_depth_blocks = RoundUpDiv4(input_depth); const index_t output_depth_blocks = RoundUpDiv4(output_depth); @@ -73,23 +88,7 @@ void DepthToSpaceOpFunctor::operator()( static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - uint32_t gws[3]; - std::stringstream ss; if (!IsVecEqual(input_shape_, input->shape())) { - if (d2s_) { - gws[0] = static_cast(output_depth_blocks); - gws[1] = static_cast(output_width); - gws[2] = static_cast(output_height * batch); - ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_" - << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - } else { - gws[0] = static_cast(input_depth_blocks); - gws[1] = static_cast(input_width); - gws[2] = static_cast(input_height * batch); - ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" - << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); - } - uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { kernel_.setArg(idx++,