From 241b8fd8180b4acd05230ddd785532bd3ed73f47 Mon Sep 17 00:00:00 2001
From: liuqi
Date: Wed, 8 Nov 2017 09:49:58 +0800
Subject: [PATCH] Finish batch to space and reverse op.

---
 mace/kernels/opencl/cl/space_to_batch.cl      |  36 ++-
 mace/kernels/opencl/conv_2d_opencl.cc         |  19 +-
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     |   5 -
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     |  74 +------
 ...ace_to_batch.h => space_to_batch_opecl.cc} |  31 ++-
 mace/kernels/space_to_batch.h                 |  40 ++++
 mace/ops/BUILD                                |   7 +-
 mace/ops/batch_to_space.cc                    |  13 ++
 mace/ops/batch_to_space.h                     |  76 +++++++
 mace/ops/batch_to_space_benchmark.cc          |  56 +++++
 mace/ops/conv_atrous_2d_test.cc               | 208 ------------------
 mace/ops/space_to_batch.cc                    |  13 ++
 mace/ops/space_to_batch.h                     |  76 +++++++
 mace/ops/space_to_batch_benchmark.cc          |  56 +++++
 mace/ops/space_to_batch_test.cc               | 163 +++++++++++---
 15 files changed, 514 insertions(+), 359 deletions(-)
 rename mace/kernels/opencl/{space_to_batch.h => space_to_batch_opecl.cc} (57%)
 create mode 100644 mace/kernels/space_to_batch.h
 create mode 100644 mace/ops/batch_to_space.cc
 create mode 100644 mace/ops/batch_to_space.h
 create mode 100644 mace/ops/batch_to_space_benchmark.cc
 delete mode 100644 mace/ops/conv_atrous_2d_test.cc
 create mode 100644 mace/ops/space_to_batch.cc
 create mode 100644 mace/ops/space_to_batch.h
 create mode 100644 mace/ops/space_to_batch_benchmark.cc

diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index fa432dd8..24a07712 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -1,39 +1,53 @@
 void kernel space_to_batch(global float* space_data_ptr,
+                           global const int* block_shape_ptr,
+                           global const int* paddings_ptr,
                            private const int space_batch,
                            private const int space_channel,
                            private const int space_height,
                            private const int space_width,
-                           private const int block_height,
-                           private const int block_width,
+                           private const int batch_height,
+                           private const int batch_width,
                            private const int b2s,
                            global float* batch_data_ptr) {
   int batch_idx = get_global_id(0);
   int batch_channel_idx = get_global_id(1);
   int batch_pixel_idx = get_global_id(2);
-  const int batch_height = space_height / block_height;
-  const int batch_width = space_width / block_width;
+  const int block_height = block_shape_ptr[0];
+  const int block_width = block_shape_ptr[1];
+  const int padding_height_start = paddings_ptr[0];
+  const int padding_width_start = paddings_ptr[2];
+
   const int batch_pixel_height_idx = batch_pixel_idx / batch_width;
   const int batch_pixel_width_idx = batch_pixel_idx % batch_width;
   const int block_size = block_height * block_width;
   const int space_idx = batch_idx / block_size;
   const int remaining_batch_idx = batch_idx % block_size;
-  const int space_pixel_height_idx = (remaining_batch_idx / block_width) +
-                                     batch_pixel_height_idx * block_height;
-  const int space_pixel_width_idx = (remaining_batch_idx % block_width) +
-                                    batch_pixel_width_idx * block_width;
+  int space_pixel_height_idx = (remaining_batch_idx / block_width) +
+                               batch_pixel_height_idx * block_height;
+  int space_pixel_width_idx = (remaining_batch_idx % block_width) +
+                              batch_pixel_width_idx * block_width;
   const int batch_data_offset = batch_idx * (space_channel * batch_height * batch_width) +
                                 (batch_channel_idx * batch_height * batch_width) +
                                 batch_pixel_height_idx * batch_width +
                                 batch_pixel_width_idx;
+  space_pixel_height_idx -= padding_height_start;
+  space_pixel_width_idx -= padding_width_start;
   const int space_data_offset = space_idx * (space_channel * space_height * space_width) +
                                 (batch_channel_idx * space_height * space_width) +
                                 space_pixel_height_idx * space_width +
                                 space_pixel_width_idx;
-  if (b2s) {
-    *(space_data_ptr + space_data_offset) = *(batch_data_ptr + batch_data_offset);
+  if (space_pixel_height_idx < 0 || space_pixel_height_idx >= space_height ||
+      space_pixel_width_idx < 0 || space_pixel_width_idx >= space_width) {
+    if (!b2s) {
+      *(batch_data_ptr + batch_data_offset) = 0;
+    }
   } else {
-    *(batch_data_ptr + batch_data_offset) = *(space_data_ptr + space_data_offset);
+    if (b2s) {
+      *(space_data_ptr + space_data_offset) = *(batch_data_ptr + batch_data_offset);
+    } else {
+      *(batch_data_ptr + batch_data_offset) = *(space_data_ptr + space_data_offset);
+    }
   }
 }
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 11099d9d..fcdb3de2 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -8,24 +8,20 @@
 namespace mace {
 namespace kernels {
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
-                               const Tensor *bias, const int dilation_height,
-                               const int dilation_width, Tensor *output);
+                               const Tensor *bias, Tensor *output);
 extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
-                               const Tensor *bias, const int dilation_height,
-                               const int dilation_width, Tensor *output);
+                               const Tensor *bias, Tensor *output);
 extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
-                               const Tensor *bias, const int dilation_height,
-                               const int dilation_width, Tensor *output);
+                               const Tensor *bias, Tensor *output);
 template <>
 void Conv2dFunctor::operator()(const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
                                Tensor *output) {
   typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
-                                       const Tensor *bias, const int dilation_height,
-                                       const int dilation_width, Tensor *output);
+                                       const Tensor *bias, Tensor *output);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5][2] = {
       {Conv2dOpenclK1x1S1, nullptr},
@@ -37,7 +33,8 @@ void Conv2dFunctor::operator()(const Tensor *input,
   index_t kernel_h = filter->shape()[2];
   index_t kernel_w = filter->shape()[3];
   if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
-      strides_[0] > 2 || selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
+      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
+      selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
     LOG(WARNING) << "OpenCL conv2d kernel with "
                  << "filter" << kernel_h << "x" << kernel_w << ","
                  << " stride " << strides_[0] << "x" << strides_[1]
@@ -53,9 +50,9 @@ void Conv2dFunctor::operator()(const Tensor *input,
     Tensor::MappingGuard input_mapper(input);
     ConstructInputWithPadding(input->data(), input->shape().data(),
                               paddings_.data(), &padded_input);
-    conv2d_func(&padded_input, filter, bias, dilations_[0], dilations_[1], output);
+    conv2d_func(&padded_input, filter, bias, output);
   }else {
-    conv2d_func(input, filter, bias, dilations_[0], dilations_[1], output);
+    conv2d_func(input, filter, bias, output);
   }
 }
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 424f2a95..130ca4b7 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -7,7 +7,6 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/utils/utils.h"
-#include "mace/core/macros.h"
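
Note for reviewers of the kernel above: the index arithmetic is easier to check against a plain CPU loop. The sketch below is illustrative only — the function name, signature and the separate pad_top/pad_left parameters are assumptions made for this note, not MACE code — but it mirrors the space-to-batch direction of the kernel for NCHW float tensors, writing zeros wherever the mapped coordinate falls in the padded border:

#include <vector>

// Reference loop for the space_to_batch kernel above (s2b direction only).
// space: [space_batch, channels, space_h, space_w]  (NCHW)
// batch: [space_batch * bh * bw, channels, batch_h, batch_w]
void SpaceToBatchReference(const std::vector<float> &space, std::vector<float> &batch,
                           int space_batch, int channels,
                           int space_h, int space_w, int batch_h, int batch_w,
                           int bh, int bw,              // block_shape[0], block_shape[1]
                           int pad_top, int pad_left) { // paddings[0], paddings[2]
  const int block_size = bh * bw;
  for (int b = 0; b < space_batch * block_size; ++b) {
    const int space_idx = b / block_size;  // original batch entry
    const int rem = b % block_size;        // position of this copy inside the block
    for (int c = 0; c < channels; ++c) {
      for (int h = 0; h < batch_h; ++h) {
        for (int w = 0; w < batch_w; ++w) {
          // Same mapping as the kernel: undo the block interleaving, then shift
          // by the leading paddings to land in the (virtually) padded space plane.
          const int sh = rem / bw + h * bh - pad_top;
          const int sw = rem % bw + w * bw - pad_left;
          const int batch_off = ((b * channels + c) * batch_h + h) * batch_w + w;
          if (sh < 0 || sh >= space_h || sw < 0 || sw >= space_w) {
            batch[batch_off] = 0;  // padded region
          } else {
            const int space_off = ((space_idx * channels + c) * space_h + sh) * space_w + sw;
            batch[batch_off] = space[space_off];
          }
        }
      }
    }
  }
}

The batch-to-space branch of the kernel (b2s != 0) simply reverses the copy and skips out-of-range coordinates instead of zero-filling them.
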
namespace mace { namespace kernels { @@ -174,11 +173,7 @@ void Conv1x1V3(const Tensor *input, extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, - const int dilation_height, - const int dilation_width, Tensor *output) { - MACE_UNUSED(dilation_height); - MACE_UNUSED(dilation_width); const index_t batch = output->shape()[0]; const index_t height = output->shape()[2]; const index_t width = output->shape()[3]; diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 4738d165..41dccf4c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -3,19 +3,14 @@ // #include "mace/core/common.h" -#include "mace/core/macros.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/conv_2d.h" -#include "mace/kernels/opencl/space_to_batch.h" namespace mace { namespace kernels { - static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, - const Tensor *bias, const uint32_t stride, - Tensor *output, const std::vector *waiting_events, - cl::Event *ret_event) { + const Tensor *bias, const uint32_t stride, Tensor *output) { const index_t channels = output->shape()[1]; const index_t height = output->shape()[2]; const index_t width = output->shape()[3]; @@ -51,75 +46,18 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, cl_int error = runtime->command_queue().enqueueNDRangeKernel( conv_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - waiting_events, - ret_event); + cl::NDRange(lws[0], lws[1], lws[2])); MACE_CHECK(error == CL_SUCCESS); } -static void CalOutputShape(const std::vector &input_shape, - const std::vector &filter_shape, - const int dilation_height, - const int dilation_width, - std::vector &output_shape) { - index_t kernel_height = filter_shape[2]; - index_t kernel_width = filter_shape[3]; - index_t output_channels = filter_shape[0]; - - index_t k_extent_height = (kernel_height - 1) * dilation_height + 1; - index_t k_extent_width = (kernel_width - 1) * dilation_width + 1; - index_t output_height = input_shape[2] - k_extent_height + 1; - index_t output_width = input_shape[3] - k_extent_width + 1; - output_shape[0] = input_shape[0]; - output_shape[1] = output_channels; - output_shape[2] = output_height; - output_shape[3] = output_width; -} -static void ResizeBatchTensor(const std::vector &input_shape, - const int dilation_height, - const int dilation_width, - Tensor *batch_tensor) { - LOG(INFO) << input_shape[2] << "\t" << input_shape[3] << "\t" <Resize({input_shape[0] * dilation_height * dilation_width, - input_shape[1], - input_shape[2] / dilation_height, - input_shape[3] / dilation_width} - ); - LOG(INFO) << batch_tensor->dim(2) << "\t" << batch_tensor->dim(3) << "\t" < 1 && dilation_width > 1) { - cl::Event events[2]; - - Tensor reshaped_input_tensor(GetDeviceAllocator(DeviceType::OPENCL), input->dtype()); - ResizeBatchTensor(input->shape(), dilation_height, dilation_width, &reshaped_input_tensor); - SpaceToBatch(const_cast(input), dilation_height, dilation_width, - &reshaped_input_tensor, nullptr, &events[0]); - Tensor reshaped_output_tensor(GetDeviceAllocator(DeviceType::OPENCL), input->dtype()); - std::vector reshaped_output_shape(4, 0); - CalOutputShape(reshaped_input_tensor.shape(), filter->shape(), - dilation_height, dilation_width, reshaped_output_shape); - reshaped_output_tensor.Resize(reshaped_output_shape); - std::vector s2b_events(1, 
events[0]); - InnerConv2dK3x3S12(&reshaped_input_tensor, filter, bias, 1, &reshaped_output_tensor, - &s2b_events, &events[1]); - std::vector conv_events(1, events[1]); - SpaceToBatch(&reshaped_output_tensor, dilation_height, dilation_width, - output, &conv_events, nullptr); - } else { - InnerConv2dK3x3S12(input, filter, bias, 1, output, nullptr, nullptr); - } + const Tensor *bias, Tensor *output) { + InnerConv2dK3x3S12(input, filter, bias, 1, output); }; void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int dilation_height, - const int dilation_width, Tensor *output) { - MACE_UNUSED(dilation_height); - MACE_UNUSED(dilation_width); - InnerConv2dK3x3S12(input, filter, bias, 2, output, nullptr, nullptr); + const Tensor *bias, Tensor *output) { + InnerConv2dK3x3S12(input, filter, bias, 2, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/space_to_batch.h b/mace/kernels/opencl/space_to_batch_opecl.cc similarity index 57% rename from mace/kernels/opencl/space_to_batch.h rename to mace/kernels/opencl/space_to_batch_opecl.cc index 0dd393b8..84601492 100644 --- a/mace/kernels/opencl/space_to_batch.h +++ b/mace/kernels/opencl/space_to_batch_opecl.cc @@ -7,32 +7,33 @@ #include "mace/core/common.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/core/tensor.h" +#include "mace/kernels/space_to_batch.h" namespace mace { namespace kernels { -template -void SpaceToBatch(Tensor *space_tensor, - const int block_height, - const int block_width, - Tensor *batch_tensor, - const std::vector *waiting_events, - cl::Event *event) { +template <> +void SpaceToBatchFunctor::operator()(Tensor *space_tensor, + const Tensor *block_shape_tensor, + const Tensor *paddings_tensor, + Tensor *batch_tensor) { auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); auto s2b_kernel = cl::Kernel(program, "space_to_batch"); + uint32_t idx = 0; - s2b_kernel.setArg(idx++, *(static_cast(space_tensor->buffer()))); + s2b_kernel.setArg(idx++, *(static_cast(space_tensor->buffer()))); + s2b_kernel.setArg(idx++, *(static_cast(block_shape_tensor->buffer()))); + s2b_kernel.setArg(idx++, *(static_cast(paddings_tensor->buffer()))); s2b_kernel.setArg(idx++, static_cast(space_tensor->dim(0))); s2b_kernel.setArg(idx++, static_cast(space_tensor->dim(1))); s2b_kernel.setArg(idx++, static_cast(space_tensor->dim(2))); s2b_kernel.setArg(idx++, static_cast(space_tensor->dim(3))); - s2b_kernel.setArg(idx++, block_height); - s2b_kernel.setArg(idx++, block_width); - s2b_kernel.setArg(idx++, static_cast(B2S)); - s2b_kernel.setArg(idx++, *(static_cast(batch_tensor->buffer()))); + s2b_kernel.setArg(idx++, static_cast(batch_tensor->dim(2))); + s2b_kernel.setArg(idx++, static_cast(batch_tensor->dim(3))); + s2b_kernel.setArg(idx++, static_cast(b2s_)); + s2b_kernel.setArg(idx++, *(static_cast(batch_tensor->buffer()))); const uint32_t gws[3] = {static_cast(batch_tensor->dim(0)), static_cast(batch_tensor->dim(1)), @@ -43,9 +44,7 @@ void SpaceToBatch(Tensor *space_tensor, cl_int error = runtime->command_queue().enqueueNDRangeKernel( s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - waiting_events, - event); + cl::NDRange(lws[0], lws[1], lws[2])); MACE_CHECK(error == CL_SUCCESS); } diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h new file mode 100644 index 00000000..06abc593 --- /dev/null +++ b/mace/kernels/space_to_batch.h @@ -0,0 +1,40 @@ +// +// Copyright (c) 2017 XiaoMi All rights 
reserved. +// + +#ifndef MACE_KERNELS_CONV_2D_H_ +#define MACE_KERNELS_CONV_2D_H_ + +#include "mace/core/tensor.h" +#include "mace/proto/mace.pb.h" + +namespace mace { +namespace kernels { + +template +struct SpaceToBatchFunctor { + SpaceToBatchFunctor(const bool b2s = false): b2s_(b2s){} + + void operator()(Tensor *input_tensor, + const Tensor *block_shape_tensor, + const Tensor *paddings_tensor, + Tensor *output_tensor) { + MACE_CHECK_NOTNULL(input_tensor); + MACE_CHECK_NOTNULL(block_shape_tensor); + MACE_CHECK_NOTNULL(paddings_tensor); + MACE_CHECK_NOTNULL(output_tensor); + } + + bool b2s_; +}; + +template <> +void SpaceToBatchFunctor::operator()(Tensor *input_tensor, + const Tensor *block_shape_tensor, + const Tensor *paddings_tensor, + Tensor *output); + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_CONV_2D_H_ diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 0f5bdbaa..581a4b96 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -67,12 +67,11 @@ cc_test( testonly = 1, srcs = glob(["space_to_batch_test.cc"]), copts = ["-std=c++11"], - linkopts = if_android(["-pie"]), + linkopts = ["-fopenmp"] + if_android(["-ldl"]), linkstatic = 1, deps = [ - "//mace/kernels", - "//mace/core", - "//mace/ops:test", + ":ops", + ":test", "@gtest//:gtest_main", ], ) diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc new file mode 100644 index 00000000..9b13e13a --- /dev/null +++ b/mace/ops/batch_to_space.cc @@ -0,0 +1,13 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/batch_to_space.h" + +namespace mace { + +REGISTER_CPU_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp); + +REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp); + +} // namespace mace diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h new file mode 100644 index 00000000..d1080f37 --- /dev/null +++ b/mace/ops/batch_to_space.h @@ -0,0 +1,76 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#ifndef MACE_OPS_SPACE_TO_BATCH_H_ +#define MACE_OPS_SPACE_TO_BATCH_H_ + +#include + +#include "mace/core/operator.h" +#include "mace/kernels/space_to_batch.h" + +namespace mace { + +static void BatchToSpaceHelper(const Tensor *input_tensor, + const Tensor *block_shape_tensor, + const Tensor *cropped_tensor, + Tensor *output) { + MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); + MACE_CHECK(block_shape_tensor->dim_size() == 1, "Block's shape should be 1D"); + MACE_CHECK(cropped_tensor->dim_size() == 2, "Paddings' shape should be 2D"); + + const index_t block_dims = block_shape_tensor->dim(0); + MACE_CHECK(block_dims == cropped_tensor->dim(0) && 2 == cropped_tensor->dim(1)); + Tensor::MappingGuard block_shape_tensor_mapper(block_shape_tensor); + Tensor::MappingGuard cropped_tensor_mapper(cropped_tensor); + const int *block_shape_ptr = block_shape_tensor->data(); + const int *cropped_ptr = cropped_tensor->data(); + std::vector output_shape(4, 0); + index_t block_shape_product = 1; + for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) { + MACE_CHECK(block_shape_ptr[block_dim] > 1, "block_shape's value should be great to 1"); + const index_t block_shape_value = block_shape_ptr[block_dim]; + const index_t cropped_input_size = input_tensor->dim(block_dim + 2) * block_shape_value + - *cropped_ptr + - *(cropped_ptr+1); + MACE_CHECK(cropped_input_size >= 0, + "cropped size must be non-negative"); + block_shape_product *= block_shape_value; + output_shape[block_dim+2] = cropped_input_size; + cropped_ptr += 2; + } + output_shape[0] = input_tensor->dim(0) / block_shape_product; + output_shape[1] = input_tensor->dim(1); + + output->Resize(output_shape); +} + +template +class BatchToSpaceNDOp: public Operator { + public: + BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) + : Operator(op_def, ws), functor_(true) {} + + bool Run() override { + const Tensor *input_tensor = this->Input(INPUT); + const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE); + const Tensor *cropped_tensor = this->Input(CROPS); + Tensor *output = this->Output(OUTPUT); + + BatchToSpaceHelper(input_tensor, block_shape_tensor, cropped_tensor, output); + functor_(output, block_shape_tensor, cropped_tensor, const_cast(input_tensor)); + return true; + } + + private: + kernels::SpaceToBatchFunctor functor_; + + protected: + OP_INPUT_TAGS(INPUT, BLOCK_SHAPE, CROPS); + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_SPACE_TO_BATCH_H_ diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc new file mode 100644 index 00000000..89e100f1 --- /dev/null +++ b/mace/ops/batch_to_space_benchmark.cc @@ -0,0 +1,56 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +template +static void BMBatchToSpace( + int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") + .Input("Input") + .Input("BlockShape") + .Input("Crops") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddInputFromArray( + "BlockShape", {2}, {2, 2}); + net.AddInputFromArray("Crops", {2, 2}, {0,1,0,1}); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_BATCH_TO_SPACE_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void BM_BATCH_TO_SPACE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMBatchToSpace(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_BATCH_TO_SPACE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_BATCH_TO_SPACE(N, C, H, W, TYPE) \ + BM_BATCH_TO_SPACE_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_BATCH_TO_SPACE(128, 128, 8, 8, float); +} // namespace mace \ No newline at end of file diff --git a/mace/ops/conv_atrous_2d_test.cc b/mace/ops/conv_atrous_2d_test.cc deleted file mode 100644 index 6d4a23a3..00000000 --- a/mace/ops/conv_atrous_2d_test.cc +++ /dev/null @@ -1,208 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/ops/ops_test_util.h" -#include "mace/kernels/conv_pool_2d_util.h" - -using namespace mace; - -class AtrousConv2dOpTest : public OpsTestBase {}; - -static void UpSampleFilter(const std::vector &filter_shape, - const std::vector &filter_data, - const int dilation_rate, - std::vector &upsampled_filter_shape, - std::vector &upsampled_filter_data) { - upsampled_filter_shape[0] = filter_shape[0]; - upsampled_filter_shape[1] = filter_shape[1]; - upsampled_filter_shape[2] = filter_shape[2] + (filter_shape[2] - 1) * (dilation_rate - 1); - upsampled_filter_shape[3] = filter_shape[3] + (filter_shape[3] - 1) * (dilation_rate - 1); - const index_t upsampled_filter_size = std::accumulate(upsampled_filter_shape.begin(), - upsampled_filter_shape.end(), - 1, std::multiplies()); - upsampled_filter_data.resize(upsampled_filter_size, 0); - index_t filter_idx = 0; - index_t upsampled_filter_idx = 0; - for (index_t n = 0; n < filter_shape[0]; ++n) { - for (index_t c = 0; c < filter_shape[1]; ++c) { - for (index_t h = 0; h < filter_shape[2]; ++h) { - for (index_t w = 0; w < filter_shape[3]; ++w) { - upsampled_filter_data[upsampled_filter_idx] = filter_data[filter_idx]; - filter_idx += 1; - upsampled_filter_idx += dilation_rate; - } - upsampled_filter_idx += 1 - dilation_rate + (dilation_rate-1) * upsampled_filter_shape[3]; - } - upsampled_filter_idx -= (dilation_rate-1) * upsampled_filter_shape[3]; - } - } -} - -template -static void RunConv2D(const std::vector &input_shape, - const std::vector &input_data, - const std::vector &filter_shape, - const std::vector &filter_data, - const std::vector &bias_shape, - const std::vector &bias_data, - const int dilation_h, - const int dilation_w, - Padding padding, - Tensor *result) { - OpsTestNet net; - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - 
.Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {dilation_h, dilation_w}) - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", input_shape, input_data); - net.AddInputFromArray( - "Filter", filter_shape, filter_data); - net.AddInputFromArray("Bias", bias_shape, bias_data); - - // Run - net.RunOp(D); - - // Check - result->Copy(*net.GetOutput("Output")); -} - -template -static void GenerateAndRunConv2D(const index_t batch, - const index_t input_channels, - const index_t height, - const index_t width, - const index_t output_channels, - const index_t kernel_h, - const index_t kernel_w, - Padding padding, - const int dilation_rate) { - srand(time(NULL)); - // Add input data - std::vector input_shape = {batch, input_channels, height, width}; - std::vector input_data; - GenerateRandomRealTypeData(input_shape, input_data); - std::vector filter_shape = {output_channels, input_channels, kernel_h, kernel_w}; - std::vector filter_data; - GenerateRandomRealTypeData(filter_shape, filter_data); - std::vector bias_shape = {output_channels}; - std::vector bias_data; - GenerateRandomRealTypeData(bias_shape, bias_data); - - std::vector upsampled_filter_shape(4, 0); - std::vector upsampled_filter_data; - UpSampleFilter(filter_shape, filter_data, dilation_rate, - upsampled_filter_shape, upsampled_filter_data); - Tensor expected_result; - // Run on cpu - RunConv2D(input_shape, input_data, - upsampled_filter_shape, upsampled_filter_data, - bias_shape, bias_data, - 1, 1, - padding, &expected_result); - - Tensor device_result(GetDeviceAllocator(D), DataTypeToEnum::v()); - // run on device - RunConv2D(input_shape, input_data, - filter_shape, filter_data, - bias_shape, bias_data, - dilation_rate, dilation_rate, - padding, &device_result); - ExpectTensorNear(expected_result, device_result, 0.001); -} -template -static void TestSimple(const int kernel_h, - const int kernel_w, - Padding padding, - const int dilation_rate) { - GenerateAndRunConv2D(1, 3, 5, 5, 1, kernel_h, kernel_w, padding, dilation_rate); -} - -TEST_F(AtrousConv2dOpTest, CPUSimple) { - for (int i = 2 ; i < 4; ++i) { - TestSimple(3, 3, VALID, i); - TestSimple(3, 3, SAME, i); - } -} - -TEST_F(AtrousConv2dOpTest, OPENCLSimple) { - for (int i = 2 ; i < 3; ++i) { - TestSimple(3, 3, VALID, i); - } -} - -template -static void TestAligned(const int kernel_h, - const int kernel_w, - Padding padding, - const int dilation_rate) { - GenerateAndRunConv2D(3, 64, 32, 32, 128, kernel_h, kernel_w, padding, dilation_rate); -} - -template -static void TestUnAligned(const int kernel_h, - const int kernel_w, - Padding padding, - const int dilation_rate) { - srand(time(NULL)); - // generate random input - index_t batch = 3 + rand() % 10; - index_t input_channels = 3 + rand() % 10; - index_t height = 107; - index_t width = 113; - index_t output_channels = 3 + rand() % 10; - - GenerateAndRunConv2D(batch, input_channels, height, width, output_channels, - kernel_h, kernel_w, padding, dilation_rate); -} - -TEST_F(AtrousConv2dOpTest, UpSample) { - const int batch = 2; - const int channel = 2; - const int height = 3; - const int width = 3; - const int rate = 2; - std::vector filter_shape = {batch, channel, height, width}; - std::vector filter_data(batch*channel*height*width, 1); - std::vector upsampled_filter_shape(4, 0); - std::vector upsampled_filter_data; - UpSampleFilter(filter_shape, filter_data, rate, - upsampled_filter_shape, 
upsampled_filter_data); - int size = std::accumulate(upsampled_filter_shape.begin(), upsampled_filter_shape.end(), - 1, std::multiplies()); - const int expected_size = batch * channel * - (height + (height-1) * (rate - 1)) * - (width + (width-1) * (rate-1)); - EXPECT_EQ(expected_size, upsampled_filter_data.size()); -} - - -TEST_F(AtrousConv2dOpTest, CPUAligned) { - for (int i = 2 ; i < 4; ++i) { - TestAligned(3, 3, VALID, i); - TestAligned(3, 3, SAME, i); - } -} - -TEST_F(AtrousConv2dOpTest, OPENCLAligned) { - for (int i = 2 ; i < 4; ++i) { - TestAligned(3, 3, VALID, i); - TestAligned(3, 3, SAME, i); - } -} - -TEST_F(AtrousConv2dOpTest, CPUUnAligned) { - for (int i = 2 ; i < 4; ++i) { - TestUnAligned(3, 3, VALID, i); - TestUnAligned(3, 3, SAME, i); - } -} - diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc new file mode 100644 index 00000000..c70daa8d --- /dev/null +++ b/mace/ops/space_to_batch.cc @@ -0,0 +1,13 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/space_to_batch.h" + +namespace mace { + +REGISTER_CPU_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp); + +REGISTER_OPENCL_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp); + +} // namespace mace diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h new file mode 100644 index 00000000..079697d4 --- /dev/null +++ b/mace/ops/space_to_batch.h @@ -0,0 +1,76 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_SPACE_TO_BATCH_H_ +#define MACE_OPS_SPACE_TO_BATCH_H_ + +#include + +#include "mace/core/operator.h" +#include "mace/kernels/space_to_batch.h" + +namespace mace { + +static void SpaceToBatchHelper(const Tensor *input_tensor, + const Tensor *block_shape_tensor, + const Tensor *paddings_tensor, + Tensor *output) { + MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); + MACE_CHECK(block_shape_tensor->dim_size() == 1, "Block's shape should be 1D"); + MACE_CHECK(paddings_tensor->dim_size() == 2, "Paddings' shape should be 2D"); + + const index_t block_dims = block_shape_tensor->dim(0); + MACE_CHECK(block_dims == paddings_tensor->dim(0) && 2 == paddings_tensor->dim(1)); + Tensor::MappingGuard block_shape_tensor_mapper(block_shape_tensor); + Tensor::MappingGuard padding_tensor_mapper(paddings_tensor); + const int *block_shape_ptr = block_shape_tensor->data(); + const int *paddings_ptr = paddings_tensor->data(); + std::vector output_shape(4, 0); + index_t block_shape_product = 1; + for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) { + MACE_CHECK(block_shape_ptr[block_dim] > 1, "block_shape's value should be great to 1"); + const index_t block_shape_value = block_shape_ptr[block_dim]; + const index_t padded_input_size = input_tensor->dim(block_dim + 2) + + *paddings_ptr + + *(paddings_ptr+1); + MACE_CHECK(padded_input_size % block_shape_value == 0, + "padded input is not divisible by block_shape"); + block_shape_product *= block_shape_value; + output_shape[block_dim+2] = padded_input_size / block_shape_value; + paddings_ptr += 2; + } + output_shape[0] = input_tensor->dim(0) * block_shape_product; + output_shape[1] = input_tensor->dim(1); + + output->Resize(output_shape); +} + +template +class SpaceToBatchNDOp : public Operator { + public: + SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) + : Operator(op_def, ws) {} + + bool Run() override { + const Tensor *input_tensor = this->Input(INPUT); + const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE); + const Tensor *paddings_tensor = this->Input(PADDINGS); + 
Tensor *output = this->Output(OUTPUT); + + SpaceToBatchHelper(input_tensor, block_shape_tensor, paddings_tensor, output); + functor_(const_cast(input_tensor), block_shape_tensor, paddings_tensor, output); + return true; + } + + private: + kernels::SpaceToBatchFunctor functor_; + + protected: + OP_INPUT_TAGS(INPUT, BLOCK_SHAPE, PADDINGS); + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_SPACE_TO_BATCH_H_ diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc new file mode 100644 index 00000000..5e119a04 --- /dev/null +++ b/mace/ops/space_to_batch_benchmark.cc @@ -0,0 +1,56 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +template +static void BMSpaceToBatch( + int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") + .Input("Input") + .Input("BlockShape") + .Input("Padding") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddInputFromArray( + "BlockShape", {2}, {2, 2}); + net.AddInputFromArray("Padding", {2, 2}, {2,3,2,3}); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_SPACE_TO_BATCH_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void BM_SPACE_TO_BATCH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMSpaceToBatch(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_SPACE_TO_BATCH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_SPACE_TO_BATCH(N, C, H, W, TYPE) \ + BM_SPACE_TO_BATCH_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_SPACE_TO_BATCH(128, 128, 15, 15, float); +} // namespace mace \ No newline at end of file diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 7454a9ba..78e933df 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -2,30 +2,92 @@ // Copyright (c) 2017 XiaoMi All rights reserved. 
// -#include "mace/kernels/opencl/space_to_batch.h" #include "gtest/gtest.h" #include "mace/ops/ops_test_util.h" using namespace mace; +template +void RunSpaceToBatch(const std::vector &input_shape, + const std::vector &input_data, + const std::vector &block_shape_shape, + const std::vector &block_shape_data, + const std::vector &padding_shape, + const std::vector &padding_data, + const Tensor *expected) { + OpsTestNet net; + OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") + .Input("Input") + .Input("BlockShape") + .Input("Padding") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddInputFromArray( + "Input", input_shape, input_data); + net.AddInputFromArray( + "BlockShape", block_shape_shape, block_shape_data); + net.AddInputFromArray("Padding", padding_shape, padding_data); + + // Run + net.RunOp(D); + + // Check + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-8); + +} + +template +void RunBatchToSpace(const std::vector &input_shape, + const std::vector &input_data, + const std::vector &block_shape_shape, + const std::vector &block_shape_data, + const std::vector &crops_shape, + const std::vector &crops_data, + const Tensor *expected) { + OpsTestNet net; + OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") + .Input("Input") + .Input("BlockShape") + .Input("Crops") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddInputFromArray( + "Input", input_shape, input_data); + net.AddInputFromArray( + "BlockShape", block_shape_shape, block_shape_data); + net.AddInputFromArray("Crops", crops_shape, crops_data); + + // Run + net.RunOp(D); + + // Check + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-8); +} + template void TestBidirectionTransform(const std::vector &space_shape, - const std::vector &space, - const int block_height, - const int block_width, + const std::vector &space_data, + const std::vector &block_shape, + const std::vector &block_data, + const std::vector &padding_shape, + const std::vector &padding_data, const std::vector &batch_shape, - const std::vector &batch) { + const std::vector &batch_data) { auto space_tensor = unique_ptr(new Tensor(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v())); space_tensor->Resize(space_shape); { Tensor::MappingGuard space_mapper(space_tensor.get()); - T *space_data = space_tensor->mutable_data(); - MACE_CHECK(static_cast(space_tensor->size()) == space.size()) + T *space_ptr = space_tensor->mutable_data(); + MACE_CHECK(static_cast(space_tensor->size()) == space_data.size()) << "Space tensor size:" << space_tensor->size() - << ", space data size:" << space.size(); - memcpy(space_data, space.data(), space.size() * sizeof(T)); + << ", space data size:" << space_data.size(); + memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T)); } auto batch_tensor = unique_ptr(new Tensor(GetDeviceAllocator(DeviceType::OPENCL), @@ -33,45 +95,65 @@ void TestBidirectionTransform(const std::vector &space_shape, batch_tensor->Resize(batch_shape); { Tensor::MappingGuard batch_mapper(batch_tensor.get()); - T *batch_data = batch_tensor->mutable_data(); - MACE_CHECK(static_cast(batch_tensor->size()) == batch.size()); - memcpy(batch_data, batch.data(), batch.size() * sizeof(T)); + T *batch_ptr = batch_tensor->mutable_data(); + MACE_CHECK(static_cast(batch_tensor->size()) == batch_data.size()); + memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T)); } - auto inner_batch_tensor = unique_ptr(new Tensor(GetDeviceAllocator(DeviceType::OPENCL), - 
DataTypeToEnum::v())); - inner_batch_tensor->Resize(batch_shape); - kernels::SpaceToBatch(space_tensor.get(), block_height, block_width, - inner_batch_tensor.get(), nullptr, nullptr); - ExpectTensorNear(*batch_tensor, *inner_batch_tensor, 1e-8); - auto inner_space_tensor = unique_ptr(new Tensor(GetDeviceAllocator(DeviceType::OPENCL), - DataTypeToEnum::v())); - inner_space_tensor->Resize(space_shape); - kernels::SpaceToBatch(inner_space_tensor.get(), block_height, block_width, - batch_tensor.get(), nullptr, nullptr); - ExpectTensorNear(*space_tensor, *inner_space_tensor, 1e-8); + RunSpaceToBatch(space_shape, space_data, + block_shape, block_data, + padding_shape, padding_data, + batch_tensor.get()); + + RunBatchToSpace(batch_shape, batch_data, + block_shape, block_data, + padding_shape, padding_data, + space_tensor.get()); } -TEST(SpaceToBatchTest, NoTransform) { +TEST(SpaceToBatchTest, SmallData) { TestBidirectionTransform({1, 1, 2, 2}, {1,2,3,4}, - 1, 1, - {1,1,2,2}, - {1,2,3,4}); + {2}, + {2, 2}, + {2, 2}, + {0, 0, 0, 0}, + {4,1,1,1}, + {1,2,3,4} + ); } -TEST(SpaceToBatchTest, SmallData) { +TEST(SpaceToBatchTest, SmallDataWithOnePadding) { TestBidirectionTransform({1, 1, 2, 2}, {1,2,3,4}, - 2, 2, - {4,1,1,1}, - {1,2,3,4}); + {2}, + {3, 3}, + {2, 2}, + {1, 0, 1, 0}, + {9,1,1,1}, + {0,0,0,0,1,2,0,3,4} + ); +} + +TEST(SpaceToBatchTest, SmallDataWithTwoPadding) { + TestBidirectionTransform({1, 1, 2, 2}, + {1,2,3,4}, + {2}, + {2, 2}, + {2, 2}, + {1, 1, 1, 1}, + {4,1,2,2}, + {0,0,0,4,0,0,3,0,0,2,0,0,1,0,0,0} + ); } TEST(SpaceToBatchTest, MultiChannelData) { TestBidirectionTransform({1, 3, 2, 2}, {1,2,3,4,5,6,7,8,9,10,11,12}, - 2, 2, + {2}, + {2, 2}, + {2, 2}, + {0, 0, 0, 0}, {4,3,1,1}, {1,5,9,2,6,10,3,7,11,4,8,12} ); @@ -80,7 +162,10 @@ TEST(SpaceToBatchTest, MultiChannelData) { TEST(SpaceToBatchTest, LargerMultiChannelData) { TestBidirectionTransform({1, 1, 4, 4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, - 2, 2, + {2}, + {2, 2}, + {2, 2}, + {0, 0, 0, 0}, {4,1,2,2}, {1,3,9,11,2,4,10,12,5,7,13,15,6,8,14,16} ); @@ -89,7 +174,10 @@ TEST(SpaceToBatchTest, LargerMultiChannelData) { TEST(SpaceToBatchTest, MultiBatchData) { TestBidirectionTransform({2, 1, 2, 4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, - 2, 2, + {2}, + {2, 2}, + {2, 2}, + {0, 0, 0, 0}, {8,1,1,2}, {1,3,2,4,5,7,6,8,9,11,10,12,13,15,14,16} ); @@ -99,7 +187,10 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) { TestBidirectionTransform({2, 2, 2, 4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}, - 2, 2, + {2}, + {2, 2}, + {2, 2}, + {0, 0, 0, 0}, {8,2,1,2}, {1,3,9,11,2,4,10,12,5,7,13,15,6,8,14,16, 17,19,25,27,18,20,26,28,21,23,29,31,22,24,30,32} -- GitLab
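
A closing note on the shape rules that SpaceToBatchHelper enforces and that the tests above exercise: the output batch grows by the product of the block sizes, while each padded spatial dimension is divided by its block size. Below is a minimal sketch of that computation (hypothetical helper, assuming an NCHW input and a 2-D block shape; not MACE API):

#include <cassert>
#include <cstdint>
#include <vector>

// Output shape of SpaceToBatchND for input {N, C, H, W}, block {bh, bw},
// paddings {top, bottom, left, right}.
std::vector<int64_t> SpaceToBatchShape(const std::vector<int64_t> &in,
                                       const int block[2], const int paddings[4]) {
  std::vector<int64_t> out(4);
  int64_t block_prod = 1;
  for (int i = 0; i < 2; ++i) {
    const int64_t padded = in[i + 2] + paddings[2 * i] + paddings[2 * i + 1];
    assert(padded % block[i] == 0);  // mirrors the divisibility MACE_CHECK in the op
    out[i + 2] = padded / block[i];
    block_prod *= block[i];
  }
  out[0] = in[0] * block_prod;  // batch grows by bh * bw
  out[1] = in[1];               // channels are untouched
  return out;
}

// Worked example matching SpaceToBatchTest.SmallDataWithOnePadding:
//   in = {1, 1, 2, 2}, block = {3, 3}, paddings = {1, 0, 1, 0}
//   padded spatial dims: (2 + 1 + 0) x (2 + 1 + 0) = 3 x 3
//   output shape: {1 * 3 * 3, 1, 3 / 3, 3 / 3} = {9, 1, 1, 1}

BatchToSpaceND inverts this: each spatial dimension is multiplied by its block size and the crops are subtracted, while the batch dimension is divided by the product of the block sizes, as in BatchToSpaceHelper above.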