Commit 241b8fd8 authored by liuqi

Finish the batch-to-space op and its reverse (space-to-batch).

Parent 7b428196
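This commit adds SpaceToBatchND and BatchToSpaceND operators to MACE: a single OpenCL kernel that runs both directions via a b2s flag, an OpenCL SpaceToBatchFunctor driving it, shape-inference helpers in the op headers, CPU/OpenCL registrations, plus tests and benchmarks. It also backs the temporary dilation plumbing out of the OpenCL conv2d path: the dilation parameters disappear from the kernel-function signatures, and any dilation other than 1 now takes the fallback branch.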
void kernel space_to_batch(global float* space_data_ptr,
global const int* block_shape_ptr,
global const int* paddings_ptr,
private const int space_batch,
private const int space_channel,
private const int space_height,
private const int space_width,
private const int batch_height,
private const int batch_width,
private const int b2s,
global float* batch_data_ptr) {
int batch_idx = get_global_id(0);
int batch_channel_idx = get_global_id(1);
int batch_pixel_idx = get_global_id(2);
const int block_height = block_shape_ptr[0];
const int block_width = block_shape_ptr[1];
const int padding_height_start = paddings_ptr[0];
const int padding_width_start = paddings_ptr[2];
const int batch_pixel_height_idx = batch_pixel_idx / batch_width;
const int batch_pixel_width_idx = batch_pixel_idx % batch_width;
const int block_size = block_height * block_width;
const int space_idx = batch_idx / block_size;
const int remaining_batch_idx = batch_idx % block_size;
int space_pixel_height_idx = (remaining_batch_idx / block_width) +
batch_pixel_height_idx * block_height;
int space_pixel_width_idx = (remaining_batch_idx % block_width) +
batch_pixel_width_idx * block_width;
const int batch_data_offset = batch_idx * (space_channel * batch_height * batch_width) +
(batch_channel_idx * batch_height * batch_width) +
batch_pixel_height_idx * batch_width +
batch_pixel_width_idx;
space_pixel_height_idx -= padding_height_start;
space_pixel_width_idx -= padding_width_start;
const int space_data_offset = space_idx * (space_channel * space_height * space_width) +
(batch_channel_idx * space_height * space_width) +
space_pixel_height_idx * space_width +
space_pixel_width_idx;
if (space_pixel_height_idx < 0 || space_pixel_height_idx >= space_height ||
space_pixel_width_idx < 0 || space_pixel_width_idx >= space_width) {
if (!b2s) {
*(batch_data_ptr + batch_data_offset) = 0;
}
} else {
if (b2s) {
*(space_data_ptr + space_data_offset) = *(batch_data_ptr + batch_data_offset);
} else {
*(batch_data_ptr + batch_data_offset) = *(space_data_ptr + space_data_offset);
}
}
}
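For readers tracing the index arithmetic, below is a minimal host-side C++ sketch of the same mapping (not part of the commit; the function name and parameters are illustrative). Each innermost iteration corresponds to one work item of the kernel above, fixed to the space-to-batch direction, with reads that fall into the padding region zero-filled:

#include <vector>

// Reference space-to-batch: batch index n encodes (source image, block offset).
void SpaceToBatchReference(const std::vector<float> &space,  // NCHW
                           std::vector<float> &batch,        // NCHW
                           int space_channel, int space_height, int space_width,
                           int block_height, int block_width,
                           int padding_top, int padding_left,
                           int batch_count, int batch_height, int batch_width) {
  const int block_size = block_height * block_width;
  for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
    const int space_idx = batch_idx / block_size;   // which source image
    const int remaining = batch_idx % block_size;   // offset inside the block
    for (int c = 0; c < space_channel; ++c) {
      for (int h = 0; h < batch_height; ++h) {
        for (int w = 0; w < batch_width; ++w) {
          // Same index math as the kernel, including the padding shift.
          const int sh = remaining / block_width + h * block_height - padding_top;
          const int sw = remaining % block_width + w * block_width - padding_left;
          const int batch_offset =
              ((batch_idx * space_channel + c) * batch_height + h) * batch_width + w;
          const bool in_range = 0 <= sh && sh < space_height &&
                                0 <= sw && sw < space_width;
          const int space_offset =
              ((space_idx * space_channel + c) * space_height + sh) * space_width + sw;
          batch[batch_offset] = in_range ? space[space_offset] : 0.f;
        }
      }
    }
  }
}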
......@@ -8,24 +8,20 @@ namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, nullptr},
......@@ -37,7 +33,8 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
index_t kernel_h = filter->shape()[2];
index_t kernel_w = filter->shape()[3];
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
......@@ -53,9 +50,9 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor::MappingGuard input_mapper(input);
ConstructInputWithPadding(input->data<float>(), input->shape().data(), paddings_.data(),
&padded_input);
conv2d_func(&padded_input, filter, bias, output);
} else {
conv2d_func(input, filter, bias, output);
}
}
......
......@@ -7,7 +7,6 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/utils.h"
#include "mace/core/macros.h"
namespace mace {
namespace kernels {
......@@ -174,11 +173,7 @@ void Conv1x1V3(const Tensor *input,
extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->shape()[0];
const index_t height = output->shape()[2];
const index_t width = output->shape()[3];
......
......@@ -3,19 +3,14 @@
//
#include "mace/core/common.h"
#include "mace/core/macros.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/space_to_batch.h"
namespace mace {
namespace kernels {
static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const uint32_t stride, Tensor *output) {
const index_t channels = output->shape()[1];
const index_t height = output->shape()[2];
const index_t width = output->shape()[3];
......@@ -51,75 +46,18 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS);
}
static void CalOutputShape(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape,
const int dilation_height,
const int dilation_width,
std::vector<index_t> &output_shape) {
index_t kernel_height = filter_shape[2];
index_t kernel_width = filter_shape[3];
index_t output_channels = filter_shape[0];
index_t k_extent_height = (kernel_height - 1) * dilation_height + 1;
index_t k_extent_width = (kernel_width - 1) * dilation_width + 1;
index_t output_height = input_shape[2] - k_extent_height + 1;
index_t output_width = input_shape[3] - k_extent_width + 1;
output_shape[0] = input_shape[0];
output_shape[1] = output_channels;
output_shape[2] = output_height;
output_shape[3] = output_width;
}
static void ResizeBatchTensor(const std::vector<index_t> &input_shape,
const int dilation_height,
const int dilation_width,
Tensor *batch_tensor) {
LOG(INFO) << input_shape[2] << "\t" << input_shape[3] << "\t" <<dilation_height;
batch_tensor->Resize({input_shape[0] * dilation_height * dilation_width,
input_shape[1],
input_shape[2] / dilation_height,
input_shape[3] / dilation_width}
);
LOG(INFO) << batch_tensor->dim(2) << "\t" << batch_tensor->dim(3) << "\t" <<dilation_width;
}
void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output) {
InnerConv2dK3x3S12(input, filter, bias, 1, output);
}
void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output) {
InnerConv2dK3x3S12(input, filter, bias, 2, output);
}
} // namespace kernels
......
......@@ -7,32 +7,33 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/space_to_batch.h"
namespace mace {
namespace kernels {
template <bool B2S = false>
void SpaceToBatch(Tensor *space_tensor,
const int block_height,
const int block_width,
Tensor *batch_tensor,
const std::vector<cl::Event> *waiting_events,
cl::Event *event) {
template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto s2b_kernel = cl::Kernel(program, "space_to_batch");
uint32_t idx = 0;
s2b_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(space_tensor->buffer())));
s2b_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(space_tensor->buffer())));
s2b_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(block_shape_tensor->buffer())));
s2b_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(paddings_tensor->buffer())));
s2b_kernel.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
s2b_kernel.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
s2b_kernel.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
s2b_kernel.setArg(idx++, static_cast<int32_t>(space_tensor->dim(3)));
s2b_kernel.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
s2b_kernel.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(3)));
s2b_kernel.setArg(idx++, static_cast<int32_t>(b2s_));
s2b_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(batch_tensor->buffer())));
const uint32_t gws[3] = {static_cast<uint32_t>(batch_tensor->dim(0)),
static_cast<uint32_t>(batch_tensor->dim(1)),
......@@ -43,9 +44,7 @@ void SpaceToBatch(Tensor *space_tensor,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS);
}
......
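For reference, a hedged sketch of how the specialization above is driven from host code; `space`, `block_shape`, `paddings`, and `batch` are assumed to be already-allocated, correctly-resized tensors, mirroring the op implementations further down:

// b2s_ == false selects space-to-batch; BatchToSpaceND constructs the
// functor with true to run the reverse mapping through the same kernel.
kernels::SpaceToBatchFunctor<DeviceType::OPENCL, float> space_to_batch(false);
space_to_batch(space, block_shape, paddings, batch);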
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_SPACE_TO_BATCH_H_
#define MACE_KERNELS_SPACE_TO_BATCH_H_
#include "mace/core/tensor.h"
#include "mace/proto/mace.pb.h"
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct SpaceToBatchFunctor {
SpaceToBatchFunctor(const bool b2s = false) : b2s_(b2s) {}
void operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output_tensor) {
MACE_CHECK_NOTNULL(input_tensor);
MACE_CHECK_NOTNULL(block_shape_tensor);
MACE_CHECK_NOTNULL(paddings_tensor);
MACE_CHECK_NOTNULL(output_tensor);
}
bool b2s_;
};
template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_SPACE_TO_BATCH_H_
......@@ -67,12 +67,11 @@ cc_test(
testonly = 1,
srcs = glob(["space_to_batch_test.cc"]),
copts = ["-std=c++11"],
linkopts = if_android(["-pie"]),
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
linkstatic = 1,
deps = [
"//mace/kernels",
"//mace/core",
"//mace/ops:test",
":ops",
":test",
"@gtest//:gtest_main",
],
)
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/batch_to_space.h"
namespace mace {
REGISTER_CPU_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp<DeviceType::CPU, float>);
REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_BATCH_TO_SPACE_H_
#define MACE_OPS_BATCH_TO_SPACE_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/space_to_batch.h"
namespace mace {
static void BatchToSpaceHelper(const Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *cropped_tensor,
Tensor *output) {
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape_tensor->dim_size() == 1, "Block's shape should be 1D");
MACE_CHECK(cropped_tensor->dim_size() == 2, "Paddings' shape should be 2D");
const index_t block_dims = block_shape_tensor->dim(0);
MACE_CHECK(block_dims == cropped_tensor->dim(0) && 2 == cropped_tensor->dim(1));
Tensor::MappingGuard block_shape_tensor_mapper(block_shape_tensor);
Tensor::MappingGuard cropped_tensor_mapper(cropped_tensor);
const int *block_shape_ptr = block_shape_tensor->data<int>();
const int *cropped_ptr = cropped_tensor->data<int>();
std::vector<index_t> output_shape(4, 0);
index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape_ptr[block_dim] > 1, "block_shape's value should be great to 1");
const index_t block_shape_value = block_shape_ptr[block_dim];
const index_t cropped_input_size = input_tensor->dim(block_dim + 2) * block_shape_value
- *cropped_ptr
- *(cropped_ptr+1);
MACE_CHECK(cropped_input_size >= 0,
"cropped size must be non-negative");
block_shape_product *= block_shape_value;
output_shape[block_dim+2] = cropped_input_size;
cropped_ptr += 2;
}
output_shape[0] = input_tensor->dim(0) / block_shape_product;
output_shape[1] = input_tensor->dim(1);
output->Resize(output_shape);
}
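Worked example: a batch input of shape {4,1,1,1} with block_shape {2,2} and zero crops gives spatial dims 1 * 2 - 0 - 0 = 2 and batch dim 4 / (2*2) = 1, i.e. an output of {1,1,2,2} — the inverse of the SmallData space-to-batch case in the tests.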
template <DeviceType D, typename T>
class BatchToSpaceNDOp: public Operator<D, T> {
public:
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *cropped_tensor = this->Input(CROPS);
Tensor *output = this->Output(OUTPUT);
BatchToSpaceHelper(input_tensor, block_shape_tensor, cropped_tensor, output);
functor_(output, block_shape_tensor, cropped_tensor, const_cast<Tensor*>(input_tensor));
return true;
}
private:
kernels::SpaceToBatchFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, BLOCK_SHAPE, CROPS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_BATCH_TO_SPACE_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
template <DeviceType D, typename T>
static void BMBatchToSpace(
int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Input("BlockShape")
.Input("Crops")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddInputFromArray<D, int>(
"BlockShape", {2}, {2, 2});
net.AddInputFromArray<D, int>("Crops", {2, 2}, {0,1,0,1});
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
}
net.Sync();
}
#define BM_BATCH_TO_SPACE_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_BATCH_TO_SPACE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_BATCH_TO_SPACE(N, C, H, W, TYPE) \
BM_BATCH_TO_SPACE_MACRO(N, C, H, W, TYPE, OPENCL);
BM_BATCH_TO_SPACE(128, 128, 8, 8, float);
} // namespace mace
\ No newline at end of file
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/conv_pool_2d_util.h"
using namespace mace;
class AtrousConv2dOpTest : public OpsTestBase {};
static void UpSampleFilter(const std::vector<index_t> &filter_shape,
const std::vector<float> &filter_data,
const int dilation_rate,
std::vector<index_t> &upsampled_filter_shape,
std::vector<float> &upsampled_filter_data) {
upsampled_filter_shape[0] = filter_shape[0];
upsampled_filter_shape[1] = filter_shape[1];
upsampled_filter_shape[2] = filter_shape[2] + (filter_shape[2] - 1) * (dilation_rate - 1);
upsampled_filter_shape[3] = filter_shape[3] + (filter_shape[3] - 1) * (dilation_rate - 1);
const index_t upsampled_filter_size = std::accumulate(upsampled_filter_shape.begin(),
upsampled_filter_shape.end(),
1, std::multiplies<index_t>());
upsampled_filter_data.resize(upsampled_filter_size, 0);
index_t filter_idx = 0;
index_t upsampled_filter_idx = 0;
for (index_t n = 0; n < filter_shape[0]; ++n) {
for (index_t c = 0; c < filter_shape[1]; ++c) {
for (index_t h = 0; h < filter_shape[2]; ++h) {
for (index_t w = 0; w < filter_shape[3]; ++w) {
upsampled_filter_data[upsampled_filter_idx] = filter_data[filter_idx];
filter_idx += 1;
upsampled_filter_idx += dilation_rate;
}
upsampled_filter_idx += 1 - dilation_rate + (dilation_rate-1) * upsampled_filter_shape[3];
}
upsampled_filter_idx -= (dilation_rate-1) * upsampled_filter_shape[3];
}
}
}
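The helper inserts dilation_rate - 1 zeros between neighboring taps, so at rate 2 a row [a b c] becomes [a 0 b 0 c] and a k x k filter grows to k + (k-1)(rate-1) per side; running an ordinary convolution with the upsampled filter is then the CPU reference against which the dilated device path is checked below.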
template <DeviceType D>
static void RunConv2D(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const std::vector<index_t> &filter_shape,
const std::vector<float> &filter_data,
const std::vector<index_t> &bias_shape,
const std::vector<float> &bias_data,
const int dilation_h,
const int dilation_w,
Padding padding,
Tensor *result) {
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation_h, dilation_w})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", input_shape, input_data);
net.AddInputFromArray<D, float>(
"Filter", filter_shape, filter_data);
net.AddInputFromArray<D, float>("Bias", bias_shape, bias_data);
// Run
net.RunOp(D);
// Check
result->Copy(*net.GetOutput("Output"));
}
template <DeviceType D>
static void GenerateAndRunConv2D(const index_t batch,
const index_t input_channels,
const index_t height,
const index_t width,
const index_t output_channels,
const index_t kernel_h,
const index_t kernel_w,
Padding padding,
const int dilation_rate) {
srand(time(NULL));
// Add input data
std::vector<index_t> input_shape = {batch, input_channels, height, width};
std::vector<float> input_data;
GenerateRandomRealTypeData<float>(input_shape, input_data);
std::vector<index_t> filter_shape = {output_channels, input_channels, kernel_h, kernel_w};
std::vector<float> filter_data;
GenerateRandomRealTypeData<float>(filter_shape, filter_data);
std::vector<index_t> bias_shape = {output_channels};
std::vector<float> bias_data;
GenerateRandomRealTypeData<float>(bias_shape, bias_data);
std::vector<index_t> upsampled_filter_shape(4, 0);
std::vector<float> upsampled_filter_data;
UpSampleFilter(filter_shape, filter_data, dilation_rate,
upsampled_filter_shape, upsampled_filter_data);
Tensor expected_result;
// Run on cpu
RunConv2D<DeviceType::CPU>(input_shape, input_data,
upsampled_filter_shape, upsampled_filter_data,
bias_shape, bias_data,
1, 1,
padding, &expected_result);
Tensor device_result(GetDeviceAllocator(D), DataTypeToEnum<float>::v());
// run on device
RunConv2D<D>(input_shape, input_data,
filter_shape, filter_data,
bias_shape, bias_data,
dilation_rate, dilation_rate,
padding, &device_result);
ExpectTensorNear<float>(expected_result, device_result, 0.001);
}
template <DeviceType D>
static void TestSimple(const int kernel_h,
const int kernel_w,
Padding padding,
const int dilation_rate) {
GenerateAndRunConv2D<D>(1, 3, 5, 5, 1, kernel_h, kernel_w, padding, dilation_rate);
}
TEST_F(AtrousConv2dOpTest, CPUSimple) {
for (int i = 2 ; i < 4; ++i) {
TestSimple<DeviceType::CPU>(3, 3, VALID, i);
TestSimple<DeviceType::CPU>(3, 3, SAME, i);
}
}
TEST_F(AtrousConv2dOpTest, OPENCLSimple) {
for (int i = 2 ; i < 3; ++i) {
TestSimple<DeviceType::OPENCL>(3, 3, VALID, i);
}
}
template <DeviceType D>
static void TestAligned(const int kernel_h,
const int kernel_w,
Padding padding,
const int dilation_rate) {
GenerateAndRunConv2D<D>(3, 64, 32, 32, 128, kernel_h, kernel_w, padding, dilation_rate);
}
template <DeviceType D>
static void TestUnAligned(const int kernel_h,
const int kernel_w,
Padding padding,
const int dilation_rate) {
srand(time(NULL));
// generate random input
index_t batch = 3 + rand() % 10;
index_t input_channels = 3 + rand() % 10;
index_t height = 107;
index_t width = 113;
index_t output_channels = 3 + rand() % 10;
GenerateAndRunConv2D<D>(batch, input_channels, height, width, output_channels,
kernel_h, kernel_w, padding, dilation_rate);
}
TEST_F(AtrousConv2dOpTest, UpSample) {
const int batch = 2;
const int channel = 2;
const int height = 3;
const int width = 3;
const int rate = 2;
std::vector<index_t> filter_shape = {batch, channel, height, width};
std::vector<float> filter_data(batch*channel*height*width, 1);
std::vector<index_t> upsampled_filter_shape(4, 0);
std::vector<float> upsampled_filter_data;
UpSampleFilter(filter_shape, filter_data, rate,
upsampled_filter_shape, upsampled_filter_data);
const int size = std::accumulate(upsampled_filter_shape.begin(), upsampled_filter_shape.end(),
1, std::multiplies<index_t>());
const int expected_size = batch * channel *
(height + (height-1) * (rate - 1)) *
(width + (width-1) * (rate-1));
EXPECT_EQ(expected_size, size);
EXPECT_EQ(expected_size, static_cast<int>(upsampled_filter_data.size()));
}
TEST_F(AtrousConv2dOpTest, CPUAligned) {
for (int i = 2 ; i < 4; ++i) {
TestAligned<DeviceType::CPU>(3, 3, VALID, i);
TestAligned<DeviceType::CPU>(3, 3, SAME, i);
}
}
TEST_F(AtrousConv2dOpTest, OPENCLAligned) {
for (int i = 2 ; i < 4; ++i) {
TestAligned<DeviceType::OPENCL>(3, 3, VALID, i);
TestAligned<DeviceType::OPENCL>(3, 3, SAME, i);
}
}
TEST_F(AtrousConv2dOpTest, CPUUnAligned) {
for (int i = 2 ; i < 4; ++i) {
TestUnAligned<DeviceType::CPU>(3, 3, VALID, i);
TestUnAligned<DeviceType::CPU>(3, 3, SAME, i);
}
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/space_to_batch.h"
namespace mace {
REGISTER_CPU_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp<DeviceType::CPU, float>);
REGISTER_OPENCL_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_SPACE_TO_BATCH_H_
#define MACE_OPS_SPACE_TO_BATCH_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/space_to_batch.h"
namespace mace {
static void SpaceToBatchHelper(const Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output) {
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape_tensor->dim_size() == 1, "Block's shape should be 1D");
MACE_CHECK(paddings_tensor->dim_size() == 2, "Paddings' shape should be 2D");
const index_t block_dims = block_shape_tensor->dim(0);
MACE_CHECK(block_dims == paddings_tensor->dim(0) && 2 == paddings_tensor->dim(1));
Tensor::MappingGuard block_shape_tensor_mapper(block_shape_tensor);
Tensor::MappingGuard padding_tensor_mapper(paddings_tensor);
const int *block_shape_ptr = block_shape_tensor->data<int>();
const int *paddings_ptr = paddings_tensor->data<int>();
std::vector<index_t> output_shape(4, 0);
index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape_ptr[block_dim] > 1, "block_shape's value should be great to 1");
const index_t block_shape_value = block_shape_ptr[block_dim];
const index_t padded_input_size = input_tensor->dim(block_dim + 2)
+ *paddings_ptr
+ *(paddings_ptr+1);
MACE_CHECK(padded_input_size % block_shape_value == 0,
"padded input is not divisible by block_shape");
block_shape_product *= block_shape_value;
output_shape[block_dim+2] = padded_input_size / block_shape_value;
paddings_ptr += 2;
}
output_shape[0] = input_tensor->dim(0) * block_shape_product;
output_shape[1] = input_tensor->dim(1);
output->Resize(output_shape);
}
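Worked example: input {1,1,2,2} with block_shape {2,2} and paddings {1,1,1,1} gives padded spatial dims 2 + 1 + 1 = 4, so each output spatial dim is 4 / 2 = 2 and the batch dim grows to 1 * (2*2) = 4, i.e. {4,1,2,2} — exactly the SmallDataWithTwoPadding case in the tests.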
template <DeviceType D, typename T>
class SpaceToBatchNDOp : public Operator<D, T> {
public:
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *paddings_tensor = this->Input(PADDINGS);
Tensor *output = this->Output(OUTPUT);
SpaceToBatchHelper(input_tensor, block_shape_tensor, paddings_tensor, output);
functor_(const_cast<Tensor*>(input_tensor), block_shape_tensor, paddings_tensor, output);
return true;
}
private:
kernels::SpaceToBatchFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, BLOCK_SHAPE, PADDINGS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_SPACE_TO_BATCH_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
template <DeviceType D, typename T>
static void BMSpaceToBatch(
int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("Input")
.Input("BlockShape")
.Input("Padding")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddInputFromArray<D, int>(
"BlockShape", {2}, {2, 2});
net.AddInputFromArray<D, int>("Padding", {2, 2}, {2,3,2,3});
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
}
net.Sync();
}
#define BM_SPACE_TO_BATCH_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_SPACE_TO_BATCH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_SPACE_TO_BATCH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_BATCH(N, C, H, W, TYPE) \
BM_SPACE_TO_BATCH_MACRO(N, C, H, W, TYPE, OPENCL);
BM_SPACE_TO_BATCH(128, 128, 15, 15, float);
} // namespace mace
\ No newline at end of file
......@@ -2,30 +2,92 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/opencl/space_to_batch.h"
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
template <DeviceType D>
void RunSpaceToBatch(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const std::vector<index_t> &block_shape_shape,
const std::vector<int> &block_shape_data,
const std::vector<index_t> &padding_shape,
const std::vector<int> &padding_data,
const Tensor *expected) {
OpsTestNet net;
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("Input")
.Input("BlockShape")
.Input("Padding")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", input_shape, input_data);
net.AddInputFromArray<D, int>(
"BlockShape", block_shape_shape, block_shape_data);
net.AddInputFromArray<D, int>("Padding", padding_shape, padding_data);
// Run
net.RunOp(D);
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-8);
}
template <DeviceType D>
void RunBatchToSpace(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const std::vector<index_t> &block_shape_shape,
const std::vector<int> &block_shape_data,
const std::vector<index_t> &crops_shape,
const std::vector<int> &crops_data,
const Tensor *expected) {
OpsTestNet net;
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Input("BlockShape")
.Input("Crops")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", input_shape, input_data);
net.AddInputFromArray<D, int>(
"BlockShape", block_shape_shape, block_shape_data);
net.AddInputFromArray<D, int>("Crops", crops_shape, crops_data);
// Run
net.RunOp(D);
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-8);
}
template <typename T>
void TestBidirectionTransform(const std::vector<index_t> &space_shape,
const std::vector<float> &space,
const int block_height,
const int block_width,
const std::vector<float> &space_data,
const std::vector<index_t> &block_shape,
const std::vector<int> &block_data,
const std::vector<index_t> &padding_shape,
const std::vector<int> &padding_data,
const std::vector<index_t> &batch_shape,
const std::vector<float> &batch) {
const std::vector<float> &batch_data) {
auto space_tensor = unique_ptr<Tensor>(new Tensor(GetDeviceAllocator(DeviceType::OPENCL),
DataTypeToEnum<T>::v()));
space_tensor->Resize(space_shape);
{
Tensor::MappingGuard space_mapper(space_tensor.get());
T *space_data = space_tensor->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(space_tensor->size()) == space.size())
T *space_ptr = space_tensor->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(space_tensor->size()) == space_data.size())
<< "Space tensor size:" << space_tensor->size()
<< ", space data size:" << space.size();
memcpy(space_data, space.data(), space.size() * sizeof(T));
<< ", space data size:" << space_data.size();
memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T));
}
auto batch_tensor = unique_ptr<Tensor>(new Tensor(GetDeviceAllocator(DeviceType::OPENCL),
......@@ -33,45 +95,65 @@ void TestBidirectionTransform(const std::vector<index_t> &space_shape,
batch_tensor->Resize(batch_shape);
{
Tensor::MappingGuard batch_mapper(batch_tensor.get());
T *batch_data = batch_tensor->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(batch_tensor->size()) == batch.size());
memcpy(batch_data, batch.data(), batch.size() * sizeof(T));
T *batch_ptr = batch_tensor->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(batch_tensor->size()) == batch_data.size());
memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T));
}
auto inner_batch_tensor = unique_ptr<Tensor>(new Tensor(GetDeviceAllocator(DeviceType::OPENCL),
DataTypeToEnum<T>::v()));
inner_batch_tensor->Resize(batch_shape);
kernels::SpaceToBatch(space_tensor.get(), block_height, block_width,
inner_batch_tensor.get(), nullptr, nullptr);
ExpectTensorNear<float>(*batch_tensor, *inner_batch_tensor, 1e-8);
auto inner_space_tensor = unique_ptr<Tensor>(new Tensor(GetDeviceAllocator(DeviceType::OPENCL),
DataTypeToEnum<T>::v()));
inner_space_tensor->Resize(space_shape);
kernels::SpaceToBatch<true>(inner_space_tensor.get(), block_height, block_width,
batch_tensor.get(), nullptr, nullptr);
ExpectTensorNear<float>(*space_tensor, *inner_space_tensor, 1e-8);
RunSpaceToBatch<DeviceType::OPENCL>(space_shape, space_data,
block_shape, block_data,
padding_shape, padding_data,
batch_tensor.get());
RunBatchToSpace<DeviceType::OPENCL>(batch_shape, batch_data,
block_shape, block_data,
padding_shape, padding_data,
space_tensor.get());
}
TEST(SpaceToBatchTest, SmallData) {
TestBidirectionTransform<float>({1, 1, 2, 2},
{1,2,3,4},
{2},
{2, 2},
{2, 2},
{0, 0, 0, 0},
{4,1,1,1},
{1,2,3,4}
);
}
TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
TestBidirectionTransform<float>({1, 1, 2, 2},
{1,2,3,4},
{2},
{3, 3},
{2, 2},
{1, 0, 1, 0},
{9,1,1,1},
{0,0,0,0,1,2,0,3,4}
);
}
TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
TestBidirectionTransform<float>({1, 1, 2, 2},
{1,2,3,4},
{2},
{2, 2},
{2, 2},
{1, 1, 1, 1},
{4,1,2,2},
{0,0,0,4,0,0,3,0,0,2,0,0,1,0,0,0}
);
}
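To see why SmallDataWithTwoPadding expects that layout: padding the 2x2 input by one on every side yields a 4x4 grid with the data at rows/columns 1-2. Batch n samples the padded grid at positions (n/2 + 2h, n%2 + 2w), so batch 0 picks up only the 4 (last slot), batch 1 only the 3, batch 2 only the 2, and batch 3 only the 1, each surrounded by zeros.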
TEST(SpaceToBatchTest, MultiChannelData) {
TestBidirectionTransform<float>({1, 3, 2, 2},
{1,2,3,4,5,6,7,8,9,10,11,12},
{2},
{2, 2},
{2, 2},
{0, 0, 0, 0},
{4,3,1,1},
{1,5,9,2,6,10,3,7,11,4,8,12}
);
......@@ -80,7 +162,10 @@ TEST(SpaceToBatchTest, MultiChannelData) {
TEST(SpaceToBatchTest, LargerMultiChannelData) {
TestBidirectionTransform<float>({1, 1, 4, 4},
{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
{2},
{2, 2},
{2, 2},
{0, 0, 0, 0},
{4,1,2,2},
{1,3,9,11,2,4,10,12,5,7,13,15,6,8,14,16}
);
......@@ -89,7 +174,10 @@ TEST(SpaceToBatchTest, LargerMultiChannelData) {
TEST(SpaceToBatchTest, MultiBatchData) {
TestBidirectionTransform<float>({2, 1, 2, 4},
{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
{2},
{2, 2},
{2, 2},
{0, 0, 0, 0},
{8,1,1,2},
{1,3,2,4,5,7,6,8,9,11,10,12,13,15,14,16}
);
......@@ -99,7 +187,10 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
TestBidirectionTransform<float>({2, 2, 2, 4},
{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,
17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},
{2},
{2, 2},
{2, 2},
{0, 0, 0, 0},
{8,2,1,2},
{1,3,9,11,2,4,10,12,5,7,13,15,6,8,14,16,
17,19,25,27,18,20,26,28,21,23,29,31,22,24,30,32}
......