Commit a74060eb authored by 李寅

Improve SpaceToBatch/BatchToSpace performance

Parent dbbf8596
@@ -17,6 +17,7 @@
 #include <memory>
 #include <vector>
+#include <algorithm>
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
@@ -160,10 +161,16 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     Tensor::MappingGuard input_guard(space_tensor);
     Tensor::MappingGuard output_guard(batch_tensor);
 
+    int pad_top = paddings_[0];
+    int pad_left = paddings_[2];
+    int block_shape_h = block_shape_[0];
+    int block_shape_w = block_shape_[1];
+
     if (b2s_) {
       const float *input_data = batch_tensor->data<float>();
       float *output_data = space_tensor->mutable_data<float>();
+      index_t in_batches = batch_tensor->dim(0);
       index_t in_height = batch_tensor->dim(2);
       index_t in_width = batch_tensor->dim(3);
@@ -172,26 +179,58 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
       index_t out_height = space_tensor->dim(2);
       index_t out_width = space_tensor->dim(3);
 
-#pragma omp parallel for collapse(2)
-      for (index_t b = 0; b < out_batches; ++b) {
-        for (index_t c = 0; c < channels; ++c) {
-          for (index_t h = 0; h < out_height; ++h) {
-            const index_t in_h = (h + paddings_[0]) / block_shape_[0];
-            const index_t tile_h = (h + paddings_[0]) % block_shape_[0];
-            for (index_t w = 0; w < out_width; ++w) {
-              const index_t in_w = (w + paddings_[2]) / block_shape_[1];
-              const index_t tile_w = (w + paddings_[2]) % block_shape_[1];
-              const index_t in_b =
-                  (tile_h * block_shape_[1] + tile_w) * out_batches + b;
-              output_data[((b * channels + c) * out_height + h) * out_width + w] =
-                  input_data[((in_b * channels + c) * in_height + in_h) * in_width + in_w];
-            }
-          }
-        }
-      }
+      // 32k/sizeof(float)/out_width/block_shape
+      index_t block_h_size = std::max(
+          static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
+
+      // make channel the outer loop so we can make best use of cache
+#pragma omp parallel for collapse(3)
+      for (index_t c = 0; c < channels; ++c) {
+        for (index_t block_h = 0; block_h < in_height;
+             block_h += block_h_size) {
+          for (index_t in_b = 0; in_b < in_batches; ++in_b) {
+            const index_t b = in_b % out_batches;
+            const index_t tile_index = in_b / out_batches;
+            const index_t tile_h = tile_index / block_shape_w;
+            const index_t tile_w = tile_index % block_shape_w;
+            const index_t valid_h_start = std::max(
+                block_h,
+                (pad_top - tile_h + block_shape_h - 1) / block_shape_h);
+            const index_t valid_h_end = std::min(
+                in_height,
+                std::min(block_h + block_h_size,
+                         (out_height + pad_top - tile_h + block_shape_h - 1)
+                             / block_shape_h));
+            const index_t valid_w_start = std::max(
+                static_cast<index_t>(0),
+                (pad_left - tile_w + block_shape_w - 1) / block_shape_w);
+            const index_t valid_w_end = std::min(
+                in_width,
+                (out_width + pad_left - tile_w + block_shape_w - 1)
+                    / block_shape_w);
+            const float *input_base =
+                input_data + (in_b * channels + c) * in_height * in_width;
+            float *output_base =
+                output_data + (b * channels + c) * out_height * out_width;
+
+            index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
+            for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
+              index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
+              for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
+                output_base[h * out_width + w] =
+                    input_base[in_h * in_width + in_w];
+                w += block_shape_w;
+              }  // w
+              h += block_shape_h;
+            }  // h
+          }  // b
+        }  // block_h
+      }  // c
     } else {
       const float *input_data = space_tensor->data<float>();
       float *output_data = batch_tensor->mutable_data<float>();
@@ -205,31 +244,73 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
       index_t out_height = batch_tensor->dim(2);
       index_t out_width = batch_tensor->dim(3);
 
-#pragma omp parallel for collapse(2)
-      for (index_t b = 0; b < out_batches; ++b) {
-        for (index_t c = 0; c < channels; ++c) {
-          const index_t in_b = b % in_batches;
-          const index_t tile_h = b / in_batches / block_shape_[1];
-          const index_t tile_w = b / in_batches % block_shape_[1];
-          for (index_t h = 0; h < out_height; ++h) {
-            const index_t in_h = h * block_shape_[0] + tile_h - paddings_[0];
-            for (index_t w = 0; w < out_width; ++w) {
-              const index_t in_w = w * block_shape_[1] + tile_w - paddings_[2];
-              if (in_h >= 0 && in_w >= 0 && in_h < in_height
-                  && in_w < in_width) {
-                output_data[((b * channels + c) * out_height + h) * out_width + w] =
-                    input_data[((in_b * channels + c) * in_height + in_h) * in_width + in_w];
-              } else {
-                output_data[((b * channels + c) * out_height + h) * out_width + w] = 0;
-              }
-            }
-          }
-        }
-      }
+      index_t block_h_size = std::max(
+          static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
+
+      // make channel the outer loop so we can make best use of cache
+#pragma omp parallel for collapse(3)
+      for (index_t c = 0; c < channels; ++c) {
+        for (index_t block_h = 0; block_h < out_height;
+             block_h += block_h_size) {
+          for (index_t b = 0; b < out_batches; ++b) {
+            const index_t in_b = b % in_batches;
+            const index_t tile_index = b / in_batches;
+            const index_t tile_h = tile_index / block_shape_w;
+            const index_t tile_w = tile_index % block_shape_w;
+            const index_t valid_h_start = std::max(
+                block_h,
+                (pad_top - tile_h + block_shape_h - 1) / block_shape_h);
+            const index_t valid_h_end = std::min(
+                out_height,
+                std::min(block_h + block_h_size,
+                         (in_height + pad_top - tile_h + block_shape_h - 1)
+                             / block_shape_h));
+            const index_t valid_w_start = std::max(
+                static_cast<index_t>(0),
+                (pad_left - tile_w + block_shape_w - 1) / block_shape_w);
+            const index_t valid_w_end = std::min(
+                out_width,
+                (in_width + pad_left - tile_w + block_shape_w - 1)
+                    / block_shape_w);
+            const float *input_base =
+                input_data + (in_b * channels + c) * in_height * in_width;
+            float *output_base =
+                output_data + (b * channels + c) * out_height * out_width;
+
+            memset(output_base + block_h * out_width,
+                   0,
+                   (valid_h_start - block_h) * out_width * sizeof(float));
+
+            index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
+            for (index_t h = valid_h_start; h < valid_h_end; ++h) {
+              memset(output_base + h * out_width,
+                     0,
+                     valid_w_start * sizeof(float));
+
+              index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
+              for (index_t w = valid_w_start; w < valid_w_end; ++w) {
+                output_base[h * out_width + w] =
+                    input_base[in_h * in_width + in_w];
+                in_w += block_shape_w;
+              }  // w
+              in_h += block_shape_h;
+
+              memset(output_base + h * out_width + valid_w_end,
+                     0,
+                     (out_width - valid_w_end) * sizeof(float));
+            }  // h
+
+            memset(output_base + valid_h_end * out_width,
+                   0,
+                   (std::min(out_height, block_h + block_h_size) - valid_h_end)
+                       * out_width * sizeof(float));
+          }  // b
+        }  // block_h
+      }  // c
     }
   }
 };
...
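For reference, a minimal sketch of the index mapping that the rewritten CPU batch-to-space path above implements, assuming NCHW layout and the same names as the diff (pad_top, pad_left, block_shape_h, block_shape_w). The helper below is hypothetical and unblocked; the kernel above computes the same mapping but additionally tiles the height dimension in chunks of block_h_size (the 8 * 1024 constant is roughly 32 KB / sizeof(float), divided by the row width, so one tile of rows fits in L1 cache) and moves the channel loop outermost. The space-to-batch branch is the inverse gather, with memset filling the padded border instead of a per-element bounds check.

#include <cstdint>

// Hypothetical, unblocked helper for illustration only: copies a single
// batch-to-space element in NCHW layout. The optimized kernel above computes
// the same mapping, but restricts h/w to the valid (uncropped) range per
// (channel, height-block, input-batch) tile instead of testing bounds per
// element.
void BatchToSpaceElement(const float *input, float *output,
                         int64_t out_batches, int64_t channels,
                         int64_t in_height, int64_t in_width,
                         int64_t out_height, int64_t out_width,
                         int64_t block_shape_h, int64_t block_shape_w,
                         int64_t pad_top, int64_t pad_left,
                         int64_t in_b, int64_t c, int64_t in_h, int64_t in_w) {
  // Which output image this input batch belongs to, and which block tile.
  const int64_t b = in_b % out_batches;
  const int64_t tile_index = in_b / out_batches;
  const int64_t tile_h = tile_index / block_shape_w;
  const int64_t tile_w = tile_index % block_shape_w;
  // Scatter the pixel back onto the (cropped) spatial grid.
  const int64_t h = in_h * block_shape_h + tile_h - pad_top;
  const int64_t w = in_w * block_shape_w + tile_w - pad_left;
  if (h >= 0 && h < out_height && w >= 0 && w < out_width) {
    output[((b * channels + c) * out_height + h) * out_width + w] =
        input[((in_b * channels + c) * in_height + in_h) * in_width + in_w];
  }
}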
@@ -27,16 +27,29 @@ void BMBatchToSpace(
   mace::testing::StopTiming();
   OpsTestNet net;
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-  BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
-  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
-      .Input("InputImage")
-      .Output("OutputImage")
-      .AddIntsArg("crops", {0, 0, 0, 0})
-      .AddIntsArg("block_shape", {arg, arg})
-      .Finalize(net.NewOperatorDef());
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+  } else if (D == DeviceType::GPU) {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  }
+
+  if (D == DeviceType::CPU) {
+    OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+        .Input("Input")
+        .Output("Output")
+        .AddIntsArg("crops", {0, 0, 0, 0})
+        .AddIntsArg("block_shape", {arg, arg})
+        .Finalize(net.NewOperatorDef());
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+    OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddIntsArg("crops", {0, 0, 0, 0})
+        .AddIntsArg("block_shape", {arg, arg})
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
@@ -63,11 +76,13 @@ void BMBatchToSpace(
   BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
 
 #define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
-  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
 
 BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
 BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
 BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
+BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
 
 }  // namespace test
 }  // namespace ops
...
@@ -27,17 +27,29 @@ void BMSpaceToBatch(
   mace::testing::StopTiming();
   OpsTestNet net;
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-
-  BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
-  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
-      .Input("InputImage")
-      .Output("OutputImage")
-      .AddIntsArg("paddings", {shape, shape, shape, shape})
-      .AddIntsArg("block_shape", {shape, shape})
-      .Finalize(net.NewOperatorDef());
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+  } else if (D == DeviceType::GPU) {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  }
+
+  if (D == DeviceType::CPU) {
+    OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+        .Input("Input")
+        .Output("Output")
+        .AddIntsArg("paddings", {shape, shape, shape, shape})
+        .AddIntsArg("block_shape", {shape, shape})
+        .Finalize(net.NewOperatorDef());
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+    OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddIntsArg("paddings", {shape, shape, shape, shape})
+        .AddIntsArg("block_shape", {shape, shape})
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
@@ -65,11 +77,14 @@ void BMSpaceToBatch(
     BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
 
 #define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
 
 BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
 BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
+BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
 BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
+BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
 
 }  // namespace test
 }  // namespace ops
...
@@ -213,6 +213,96 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
       9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
 }
 
+void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
+                                const std::vector<int> &block_shape_data,
+                                const std::vector<int> &padding_data) {
+  OpsTestNet net;
+  net.AddRandomInput<GPU, float>("Input", input_shape);
+
+  // run gpu
+  BufferToImage<GPU, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(GPU);
+  ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                  NHWC,
+                                                  "InputNCHW",
+                                                  NCHW);
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
+                                                  NCHW,
+                                                  "OutputCPU",
+                                                  NHWC);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
+                          *net.GetOutput("OutputGPU"));
+}
+
+void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
+                                 const std::vector<int> &block_shape_data,
+                                 const std::vector<int> &crops_data) {
+  OpsTestNet net;
+  net.AddRandomInput<GPU, float>("Input", input_shape);
+
+  // run gpu
+  BufferToImage<GPU, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(GPU);
+  ImageToBuffer<GPU, float>(&net, "OutputImage", "OutputGPU",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                  NHWC,
+                                                  "InputNCHW",
+                                                  NCHW);
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
+                                                  NCHW,
+                                                  "OutputCPU",
+                                                  NHWC);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
+                          *net.GetOutput("OutputGPU"));
+}
+
+TEST(SpaceToBatchTest, LargeData) {
+  TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
+  TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
+  TestoBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {0, 0, 0, 0});
+  TestoBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
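
As a shape sanity check on the test cases above (a worked example, not part of the commit): SpaceToBatchND with block shape (bh, bw) and paddings (top, bottom, left, right) maps an N x H x W x C input to (N * bh * bw) x ((H + top + bottom) / bh) x ((W + left + right) / bw) x C, so the first LargeData case works out as follows.

#include <cstdio>

// Worked example for TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8},
// {0, 0, 0, 0}): the output shape is exactly the {64, 32, 32, 32} input that
// the TestoBatchToSpaceLargeInput cases feed through the inverse transform.
int main() {
  const int n = 1, h = 256, w = 256, c = 32;  // NHWC input
  const int bh = 8, bw = 8;                   // block_shape
  const int pt = 0, pb = 0, pl = 0, pr = 0;   // paddings
  const int out_n = n * bh * bw;              // 1 * 8 * 8 = 64
  const int out_h = (h + pt + pb) / bh;       // 256 / 8   = 32
  const int out_w = (w + pl + pr) / bw;       // 256 / 8   = 32
  std::printf("space_to_batch output: %d x %d x %d x %d\n",
              out_n, out_h, out_w, c);        // 64 x 32 x 32 x 32
  return 0;
}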