From a065adb8297df331be1b26da0dfe3921db967cdf Mon Sep 17 00:00:00 2001 From: Unknown Date: Tue, 20 Mar 2018 16:04:24 +0800 Subject: [PATCH] add depth_to_space kernel / test/benchmark --- mace/kernels/depth_to_space.h | 80 ++++++++++++++++++++++++ mace/kernels/opencl/cl/depth_to_space.cl | 52 +++++++++++++++ mace/ops/depth_to_space.h | 62 ++++++------------ mace/ops/depth_to_space_benchmark.cc | 74 ++++++++++++++++++++++ mace/ops/depth_to_space_test.cc | 74 ++++++++++++++++++++++ 5 files changed, 299 insertions(+), 43 deletions(-) create mode 100644 mace/kernels/depth_to_space.h create mode 100644 mace/kernels/opencl/cl/depth_to_space.cl create mode 100644 mace/ops/depth_to_space_benchmark.cc create mode 100644 mace/ops/depth_to_space_test.cc diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h new file mode 100644 index 00000000..12d2fd0c --- /dev/null +++ b/mace/kernels/depth_to_space.h @@ -0,0 +1,80 @@ +// +// Created by liutuo on 18-3-20. +// + +#ifndef MACE_KERNELS_DEPTH_TO_SPACE_H +#define MACE_KERNELS_DEPTH_TO_SPACE_H + +#include "mace/core/future.h" +#include "mace/core/tensor.h" + +namespace mace { +namespace kernels { + +template +struct DepthToSpaceOpFunctor { + DepthToSpaceOpFunctor(const int block_size) : block_size_(block_size) {} + void operator()(const Tensor *input, + Tensor *output, + StatsFuture *future) { + std::vector output_shape(input->shape()); + const int batch_size = input->dim(0); + const int input_height = input->dim(1); + const int input_width = input->dim(2); + const int input_depth = input->dim(3); + + const int block_size_sq = block_size_ * block_size_; + + const index_t output_depth = input_depth / block_size_sq; + const index_t output_width = input_width * block_size_; + const index_t output_height = input_height * block_size_; + output_shape[0] = batch_size; + output_shape[1] = output_height; + output_shape[2] = output_width; + output_shape[3] = output_depth; + + output->Resize(output_shape); + + 
Tensor::MappingGuard logits_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_ptr = input->data(); + T *output_ptr = output->mutable_data(); + +#pragma omp parallel for + for (int b = 0; b < batch_size; ++b) { + for (int h = 0; h < output_height; ++h) { + const int in_h = h / block_size_; + const int offset_h = (h % block_size_); + for (int w = 0; w < output_width; ++w) { + const int in_w = w / block_size_; + const int offset_w = w % block_size_; + const int offset_d = (offset_h * block_size_ + offset_w) * output_depth; + for (int d = 0; d < output_depth; ++d) { + const int in_d = d + offset_d; + const int o_index = ((b * output_height + h) * output_width + w) * output_depth + d; + const int i_index = ((b * input_height + in_h) * input_width + in_w) * input_depth + in_d; + output_ptr[o_index] = input[i_index]; + } + } + } + } + + } + const int block_size_; +}; + +template +struct DepthToSpaceOpFunctor { + + DepthToSpaceOpFunctor(const int block_size) : block_size_(block_size) {} + void operator()(const Tensor *input, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; + const int block_size_; + std::vector input_shape_; +}; + +} // namespace kernels +} // namespace mace + +#endif //MACE_KERNELS_DEPTH_TO_SPACE_H diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl new file mode 100644 index 00000000..2a193a23 --- /dev/null +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -0,0 +1,52 @@ +#include + +// assume channes_per_group mod 4 = 0 && groups mod 4 == 0 +__kernel void channel_shuffle(__read_only image2d_t input, + __private const int groups, + __private const int channels_per_group, + __write_only image2d_t output) { + const int group_chan_blk_idx = get_global_id(0); + const int width_idx = get_global_id(1); + const int width = get_global_size(1); + const int hb_idx = get_global_id(2); + const int group_blks = groups / 4; + const int groups_blks_width = group_blks * width; + const int 
// ---- mace/kernels/opencl/cl/depth_to_space.cl ----
#include <common.h>

// NOTE(review): despite living in depth_to_space.cl, this kernel is a verbatim
// copy of the channel_shuffle kernel — it does NOT implement depth_to_space.
// It appears to be a copy-paste placeholder; a real depth_to_space kernel must
// replace it before the OpenCL path of this op can work. TODO(liutuo): confirm.
//
// assume channels_per_group mod 4 == 0 && groups mod 4 == 0
__kernel void channel_shuffle(__read_only image2d_t input,
                              __private const int groups,
                              __private const int channels_per_group,
                              __write_only image2d_t output) {
  const int group_chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
  const int width = get_global_size(1);
  const int hb_idx = get_global_id(2);
  const int group_blks = groups / 4;
  const int groups_blks_width = group_blks * width;
  const int channels_per_group_blks = channels_per_group / 4;
  const int channels_per_group_blks_width = channels_per_group_blks * width;

  DATA_TYPE4 in_chan_data0, in_chan_data1, in_chan_data2, in_chan_data3;
  DATA_TYPE4 out_chan_data0, out_chan_data1, out_chan_data2, out_chan_data3;

  int in_x = mad24(group_chan_blk_idx, width, width_idx);
  for (short g_blk = 0; g_blk < group_blks; ++g_blk) {
    // fetch 4 groups, for each group fetch 4 channels
    in_chan_data0 = READ_IMAGET(input, SAMPLER, (int2)(in_x, hb_idx));
    in_x += channels_per_group_blks_width;

    in_chan_data1 = READ_IMAGET(input, SAMPLER, (int2)(in_x, hb_idx));
    in_x += channels_per_group_blks_width;

    in_chan_data2 = READ_IMAGET(input, SAMPLER, (int2)(in_x, hb_idx));
    in_x += channels_per_group_blks_width;

    in_chan_data3 = READ_IMAGET(input, SAMPLER, (int2)(in_x, hb_idx));
    in_x += channels_per_group_blks_width;

    // Transpose the 4x4 tile so channels are regrouped across groups.
    out_chan_data0 = (DATA_TYPE4)(in_chan_data0.x, in_chan_data1.x,
                                  in_chan_data2.x, in_chan_data3.x);
    out_chan_data1 = (DATA_TYPE4)(in_chan_data0.y, in_chan_data1.y,
                                  in_chan_data2.y, in_chan_data3.y);
    out_chan_data2 = (DATA_TYPE4)(in_chan_data0.z, in_chan_data1.z,
                                  in_chan_data2.z, in_chan_data3.z);
    out_chan_data3 = (DATA_TYPE4)(in_chan_data0.w, in_chan_data1.w,
                                  in_chan_data2.w, in_chan_data3.w);

    int out_x = mad24(mad24(group_chan_blk_idx, groups, g_blk),
                      width, width_idx);
    WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data0);
    out_x += groups_blks_width;

    WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data1);
    out_x += groups_blks_width;

    WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data2);
    out_x += groups_blks_width;

    WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data3);
  }
}
public: DepthToSpaceOp(const OperatorDef &op_def, Workspace *ws) : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgument("crops", {0, 0, 0, 0}), - OperatorBase::GetSingleArgument("block_size", 1), - true) {} + block_size_(OperatorBase::GetSingleArgument("block_size", 1)), + functor_(this->block_size_) {} bool Run(StatsFuture *future) override { - const Tensor *batch_tensor = this->Input(INPUT); - Tensor *space_tensor = this->Output(OUTPUT); + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); - std::vector output_shape(4, 0); - CalculateOutputShape(batch_tensor, space_tensor, output_shape.data()); - functor_(space_tensor, output_shape, const_cast(batch_tensor), - future); - return true; - } - - private: - inline void CalculateOutputShape(const Tensor *input_tensor, - Tensor *output, - index_t *output_shape) { - auto crops = OperatorBase::GetRepeatedArgument("crops", {0, 0, 0, 0}); - auto block_shape = - OperatorBase::GetRepeatedArgument("block_shape", {1, 1}); - MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); - MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D"); - MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D"); + int input_depth = input->dim(3); + MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, + "input depth should be dividable by block_size * block_size", + input->dim(3)); - const index_t block_dims = block_shape.size(); - index_t block_shape_product = 1; - for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) { - MACE_CHECK(block_shape[block_dim] > 1, - "block_shape's value should be great to 1"); - const index_t block_shape_value = block_shape[block_dim]; - const index_t cropped_input_size = - input_tensor->dim(block_dim + 1) * block_shape_value - - crops[block_dim * 2] - crops[block_dim * 2 + 1]; - MACE_CHECK(cropped_input_size >= 0, "cropped size must be non-negative"); - 
block_shape_product *= block_shape_value; - output_shape[block_dim + 1] = cropped_input_size; - } - output_shape[0] = input_tensor->dim(0) / block_shape_product; - output_shape[3] = input_tensor->dim(3); + functor_(input, output, future); + return true; } - private: - kernels::DepthToSpaceOpFunctor functor_; + private: + kernels::DepthToSpaceOpFunctor functor_; + + protected: + const int block_size_; + OP_INPUT_TAGS(INPUT); + OP_OUTPUT_TAGS(OUTPUT); - protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); }; } // namespace ops diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc new file mode 100644 index 00000000..a3356b96 --- /dev/null +++ b/mace/ops/depth_to_space_benchmark.cc @@ -0,0 +1,74 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +template +static void DepthToSpace( + int iters, int batch, int channels, int height, int width, int block_size) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", {batch, height, width, channels}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + + OpDefBuilder("DepthToSpace", "DepthToSpaceBM") + .Input("InputImage") + .Output("Output") + .AddIntArg("block_size", block_size) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("DepthToSpace", "DepthToSpaceBM") + .Input("Input") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \ + static void \ + BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ + int iters) 
{ \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + DepthToSpace(iters, N, C, H, W, G); \ + } \ + BENCHMARK(BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE) + +#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \ + BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \ + BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, OPENCL); \ + BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, OPENCL); + +BM_DEPTH_TO_SPACE(1, 64, 64, 64, 8); +BM_DEPTH_TO_SPACE(1, 64, 128, 128, 8); +BM_DEPTH_TO_SPACE(1, 64, 256, 256, 8); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc new file mode 100644 index 00000000..7cf3d9f3 --- /dev/null +++ b/mace/ops/depth_to_space_test.cc @@ -0,0 +1,74 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class DepthToSpaceOpTest : public OpsTestBase {}; + +TEST_F(DepthToSpaceOpTest, C8G4_CPU) { + // Construct graph + OpsTestNet net; + OpDefBuilder("DepthToSpace", "DepthToSpaceTest") + .Input("Input") + .Output("Output") + .AddIntArg("block_size", 1) + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddInputFromArray( + "Input", {1, 1, 2, 8}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + // Run + net.RunOp(); + + // Check + auto expected = CreateTensor( + {1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); +} + +TEST_F(DepthToSpaceOpTest, C16G4_OPENCL) { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 1, 2, 16}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + 
BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + + OpDefBuilder("DepthToSpace", "DepthToSpaceTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("block_size", 1) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(DeviceType::OPENCL); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT_CHANNEL); + + // Check + auto expected = CreateTensor( + {1, 1, 2, 16}, + {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); +} + +} // namespace test +} // namespace ops +} // namespace mace -- GitLab