diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 90103b4fc1ccd9203005d9a8fe61915a69b5f652..46a0e0318b26c24cb230d32b2eea41751a55a012 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <vector>
+#include <algorithm>
 
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
@@ -160,10 +161,16 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     Tensor::MappingGuard input_guard(space_tensor);
     Tensor::MappingGuard output_guard(batch_tensor);
 
+    int pad_top = paddings_[0];
+    int pad_left = paddings_[2];
+    int block_shape_h = block_shape_[0];
+    int block_shape_w = block_shape_[1];
+
     if (b2s_) {
       const float *input_data = batch_tensor->data<float>();
       float *output_data = space_tensor->mutable_data<float>();
+      index_t in_batches = batch_tensor->dim(0);
       index_t in_height = batch_tensor->dim(2);
       index_t in_width = batch_tensor->dim(3);
@@ -172,26 +179,58 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
       index_t out_height = space_tensor->dim(2);
       index_t out_width = space_tensor->dim(3);
 
-#pragma omp parallel for collapse(2)
-      for (index_t b = 0; b < out_batches; ++b) {
-        for (index_t c = 0; c < channels; ++c) {
-          for (index_t h = 0; h < out_height; ++h) {
-            const index_t in_h = (h + paddings_[0]) / block_shape_[0];
-            const index_t tile_h = (h + paddings_[0]) % block_shape_[0];
-            for (index_t w = 0; w < out_width; ++w) {
-              const index_t in_w = (w + paddings_[2]) / block_shape_[1];
-              const index_t tile_w = (w + paddings_[2]) % block_shape_[1];
-              const index_t
-                  in_b = (tile_h * block_shape_[1] + tile_w) * out_batches + b;
-              output_data[((b * channels + c) * out_height + h) * out_width
-                  + w] =
-                  input_data[
-                      ((in_b * channels + c) * in_height + in_h) * in_width
-                          + in_w];
-            }
-          }
-        }
-      }
+      // 32k / sizeof(float) / out_width / block_shape_w
+      index_t block_h_size =
+          std::max(static_cast<index_t>(1),
+                   8 * 1024 / block_shape_w / out_width);
+
+      // make channel the outer loop so we can make best use of the cache
+#pragma omp parallel for collapse(3)
+      for (index_t c = 0; c < channels; ++c) {
+        for (index_t block_h = 0; block_h < in_height;
+             block_h += block_h_size) {
+          for (index_t in_b = 0; in_b < in_batches; ++in_b) {
+            const index_t b = in_b % out_batches;
+            const index_t tile_index = in_b / out_batches;
+            const index_t tile_h = tile_index / block_shape_w;
+            const index_t tile_w = tile_index % block_shape_w;
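+            // clip the copy (ceiling division) to input coordinates whose
+            // mapped output positions land inside this height block and
+            // inside the un-padded output region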
+            const index_t valid_h_start =
+                std::max(block_h,
+                         (pad_top - tile_h + block_shape_h - 1)
+                             / block_shape_h);
+            const index_t valid_h_end =
+                std::min(in_height,
+                         std::min(block_h + block_h_size,
+                                  (out_height + pad_top - tile_h
+                                      + block_shape_h - 1) / block_shape_h));
+            const index_t valid_w_start =
+                std::max(static_cast<index_t>(0),
+                         (pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const index_t valid_w_end =
+                std::min(in_width,
+                         (out_width + pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const float *input_base =
+                input_data + (in_b * channels + c) * in_height * in_width;
+            float *output_base =
+                output_data + (b * channels + c) * out_height * out_width;
+
+            index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
+            for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
+              index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
+              for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
+                output_base[h * out_width + w] =
+                    input_base[in_h * in_width + in_w];
+                w += block_shape_w;
+              } // w
+              h += block_shape_h;
+            } // h
+          } // b
+        } // block_h
+      } // c
     } else {
       const float *input_data = space_tensor->data<float>();
       float *output_data = batch_tensor->mutable_data<float>();
@@ -205,31 +244,73 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
       index_t out_height = batch_tensor->dim(2);
       index_t out_width = batch_tensor->dim(3);
 
-#pragma omp parallel for collapse(2)
-      for (index_t b = 0; b < out_batches; ++b) {
-        for (index_t c = 0; c < channels; ++c) {
-          const index_t in_b = b % in_batches;
-          const index_t tile_h = b / in_batches / block_shape_[1];
-          const index_t tile_w = b / in_batches % block_shape_[1];
-          for (index_t h = 0; h < out_height; ++h) {
-            const index_t in_h = h * block_shape_[0] + tile_h - paddings_[0];
-            for (index_t w = 0; w < out_width; ++w) {
-              const index_t in_w = w * block_shape_[1] + tile_w - paddings_[2];
-              if (in_h >= 0 && in_w >= 0 && in_h < in_height
-                  && in_w < in_width) {
-                output_data[((b * channels + c) * out_height + h) * out_width
-                    + w] =
-                    input_data[
-                        ((in_b * channels + c) * in_height + in_h) * in_width
-                            + in_w];
-              } else {
-                output_data[((b * channels + c) * out_height + h) * out_width
-                    + w] = 0;
-              }
-            }
-          }
-        }
-      }
+      index_t block_h_size =
+          std::max(static_cast<index_t>(1),
+                   8 * 1024 / block_shape_w / in_width);
+
+      // make channel the outer loop so we can make best use of the cache
+#pragma omp parallel for collapse(3)
+      for (index_t c = 0; c < channels; ++c) {
+        for (index_t block_h = 0; block_h < out_height;
+             block_h += block_h_size) {
+          for (index_t b = 0; b < out_batches; ++b) {
+            const index_t in_b = b % in_batches;
+            const index_t tile_index = b / in_batches;
+            const index_t tile_h = tile_index / block_shape_w;
+            const index_t tile_w = tile_index % block_shape_w;
+            const index_t valid_h_start =
+                std::max(block_h,
+                         (pad_top - tile_h + block_shape_h - 1)
+                             / block_shape_h);
+            const index_t valid_h_end =
+                std::min(out_height,
+                         std::min(block_h + block_h_size,
+                                  (in_height + pad_top - tile_h
+                                      + block_shape_h - 1) / block_shape_h));
+            const index_t valid_w_start =
+                std::max(static_cast<index_t>(0),
+                         (pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const index_t valid_w_end =
+                std::min(out_width,
+                         (in_width + pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const float *input_base =
+                input_data + (in_b * channels + c) * in_height * in_width;
+            float *output_base =
+                output_data + (b * channels + c) * out_height * out_width;
+
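+            // zero-fill output rows/columns that map outside the padded
+            // input; copy only the valid interior, striding by block_shape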
+            memset(output_base + block_h * out_width, 0,
+                   (valid_h_start - block_h) * out_width * sizeof(float));
+
+            index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
+            for (index_t h = valid_h_start; h < valid_h_end; ++h) {
+              memset(output_base + h * out_width, 0,
+                     valid_w_start * sizeof(float));
+
+              index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
+              for (index_t w = valid_w_start; w < valid_w_end; ++w) {
+                output_base[h * out_width + w] =
+                    input_base[in_h * in_width + in_w];
+                in_w += block_shape_w;
+              } // w
+              in_h += block_shape_h;
+
+              memset(output_base + h * out_width + valid_w_end, 0,
+                     (out_width - valid_w_end) * sizeof(float));
+            } // h
+
+            memset(output_base + valid_h_end * out_width, 0,
+                   (std::min(out_height, block_h + block_h_size) - valid_h_end)
+                       * out_width * sizeof(float));
+          } // b
+        } // block_h
+      } // c
     }
   }
 };
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index 5cfe7015cf531d73c22af4f0ac6ccc3b3292d9bc..fdb7331706afc7635f103bf02e29b30ca29c8cbb 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -27,16 +27,29 @@ void BMBatchToSpace(
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-  BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
-  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
-      .Input("InputImage")
-      .Output("OutputImage")
-      .AddIntsArg("crops", {0, 0, 0, 0})
-      .AddIntsArg("block_shape", {arg, arg})
-      .Finalize(net.NewOperatorDef());
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+  } else if (D == DeviceType::GPU) {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  }
+  if (D == DeviceType::CPU) {
+    OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+        .Input("Input")
+        .Output("Output")
+        .AddIntsArg("crops", {0, 0, 0, 0})
+        .AddIntsArg("block_shape", {arg, arg})
+        .Finalize(net.NewOperatorDef());
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+    OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddIntsArg("crops", {0, 0, 0, 0})
+        .AddIntsArg("block_shape", {arg, arg})
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
@@ -63,11 +76,13 @@ void BMBatchToSpace(
   BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
 
 #define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
-  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
 
 BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
 BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
 BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
+BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
 
 } // namespace test
 } // namespace ops
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index ff3ee74925bcbafe037c7af4e732aeb861e0a1e9..524a32cec2849657c74d37453047e33909cfc6f5 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -27,17 +27,29 @@ void BMSpaceToBatch(
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-
-  BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
-  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
-      .Input("InputImage")
-      .Output("OutputImage")
-      .AddIntsArg("paddings", {shape, shape, shape, shape})
-      .AddIntsArg("block_shape", {shape, shape})
-      .Finalize(net.NewOperatorDef());
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+  } else if (D == DeviceType::GPU) {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  }
+  if (D == DeviceType::CPU) {
+    OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+        .Input("Input")
+        .Output("Output")
+        .AddIntsArg("paddings", {shape, shape, shape, shape})
+        .AddIntsArg("block_shape", {shape, shape})
+        .Finalize(net.NewOperatorDef());
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+    OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddIntsArg("paddings", {shape, shape, shape, shape})
+        .AddIntsArg("block_shape", {shape, shape})
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
@@ -65,11 +77,14 @@ void BMSpaceToBatch(
       BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
 
 #define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
 
 BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
 BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
+BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
 BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
+BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
 
 } // namespace test
 } // namespace ops
diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc
index b464c41537e825e1cead227b29ed663582686dee..cca39bafc75a082eab41db042501795e2f3de282 100644
--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -213,6 +213,96 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
                      9, 10, 13, 14, 25, 26, 29, 30,
                      11, 12, 15, 16, 27, 28, 31, 32});
 }
 
+void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
+                                const std::vector<int> &block_shape_data,
+                                const std::vector<int> &padding_data) {
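+  // run SpaceToBatchND on GPU (NHWC image) and CPU (NCHW buffer) with the
+  // same random input, then check that the two outputs match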
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
+
+  // run gpu
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(GPU);
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OutputGPU",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
+                                                  "InputNCHW", NCHW);
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                  "OutputCPU", NHWC);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
+                          *net.GetOutput("OutputGPU"));
+}
+
+void TestBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
+                                const std::vector<int> &block_shape_data,
+                                const std::vector<int> &crops_data) {
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
+
+  // run gpu
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(GPU);
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OutputGPU",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
+                                                  "InputNCHW", NCHW);
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                  "OutputCPU", NHWC);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
+                          *net.GetOutput("OutputGPU"));
+}
+
+TEST(SpaceToBatchTest, LargeData) {
+  TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
+  TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
+  TestBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {0, 0, 0, 0});
+  TestBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
+}
+
 } // namespace test
 } // namespace ops
 } // namespace mace