提交 c899d438 编写于 作者: L liuqi

Finish max/avg pooling opencl kernel.

上级 e19514a9
......@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
filter_shape[0] = input_tensor->shape()[1];
filter_shape[1] = input_tensor->shape()[1];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, output_shape.data(),
paddings.data());
output_tensor->Resize(output_shape);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#ifdef __COPY_MAKE_PADDING
Tensor padded_input;
ConstructInputWithPadding(input_tensor, paddings, &padded_input);
ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input);
input = padded_input.data<float>();
input_shape = padded_input.shape().data();
#endif
......@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 2x2, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else
PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape,
paddings);
PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings.data());
#endif
} else { // AVG_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else
PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape,
paddings);
PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings.data());
#endif
}
} else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
......@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 3x3, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else
PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape,
paddings);
PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings.data());
#endif
} else { // AVG_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else
PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape,
paddings);
PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings.data());
#endif
}
} else { // not implement yet
......
......@@ -18,4 +18,7 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
......@@ -57,15 +57,14 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
// Unrolling this loop hurt perfmance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
......
......@@ -59,15 +59,13 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
// Unrolling this loop hurt perfmance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
......
#include <common.h>
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
DATA_TYPE row01 = *(input_ptr + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
DATA_TYPE row11 = *(input_ptr + in_width + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
return res;
}
// Supported data type: half/float
__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__global DATA_TYPE *output) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
VEC_DATA_TYPE(DATA_TYPE,4) res;
#ifdef STRIDE_1
res = vec_pooling_3_s1(input_ptr, in_width);
#ifdef FP16
#define MIN_VALUE -HALF_MAX
#else
res = vec_pooling_3_s2(input_ptr, in_width);
#define MIN_VALUE -FLT_MAX
#endif
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = inner_pooling_3(input_ptr, in_width);
input_ptr += stride;
}
}
}
}
int calculate_avg_block_size(const int pos_h,
const int pos_w,
const int pool_size,
const int pad_h,
const int pad_w,
const int h_size,
const int w_size) {
const int h_start = max(0, pos_h - pad_h);
const int w_start = max(0, pos_w - pad_w);
const int h_end = min(pos_h + pool_size - pad_h, h_size);
const int w_end = min(pos_w + pool_size - pad_w, w_size);
inline int calculate_avg_block_size(const int pool_size,
const int pos_h,
const int pos_w,
const int h_size,
const int w_size) {
const int h_start = max(0, pos_h);
const int w_start = max(0, pos_w);
const int h_end = min(pos_h + pool_size, h_size);
const int w_end = min(pos_w + pool_size, w_size);
return (h_end - h_start) * (w_end - w_start);
}
// Supported data type: half/float
__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__private const int pad_h,
__private const int pad_w,
__private const int pooling_size,
__global DATA_TYPE *output) {
int batch = get_global_id(0);
int out_chan_idx = get_global_id(1);
int out_pixel_idx = get_global_id(2);
const int out_pixel_height = out_pixel_idx / out_width;
const int out_pixel_width = out_pixel_idx % out_width;
const int out_chan_begin = out_chan_idx * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int in_pixel_idx = out_pixel_height * stride * in_width
+ out_pixel_width * stride;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_idx;
DATA_TYPE *output_base = output + out_offset + out_pixel_idx;
const int block_size = calculate_avg_block_size(
out_pixel_height * stride,
out_pixel_width * stride,
pooling_size,
pad_h/2,
pad_w/2,
in_height - pad_h,
in_width - pad_w);
for (int i = out_chan_begin; i < out_chan_end; ++i) {
VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f;
DATA_TYPE sum1 = 0.0f;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
for (int y = 0; y < pooling_size; ++y) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width;
int x = 0;
for (; x < (pooling_size-8); x += 8) {
VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr);
sum8 += data;
input_ptr += 8;
}
for (; x < pooling_size; ++x) {
sum1 += *input_ptr;
input_ptr++;
__kernel void pooling(__read_only image2d_t input,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int pad_top,
__private const int pad_left,
__private const int stride,
__private const int pooling_size,
__write_only image2d_t output) {
const int out_chan_idx = get_global_id(0);
const int out_width_idx = get_global_id(1);
const int out_width = get_global_size(1);
const int out_hb_idx = get_global_id(2);
const int batch_idx = (out_hb_idx / out_height) * in_height;
const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
const int in_width_start = out_width_idx * stride - pad_left;
const int in_channel_offset = out_chan_idx * in_width;
#ifdef POOL_AVG
DATA_TYPE4 res = 0;
for (int height = 0; height < pooling_size; ++height) {
int in_height_idx = in_height_start + height;
in_height_idx = select(batch_idx + in_height_idx,
-1,
(in_height_idx < 0 || in_height_idx >= in_height));
for (int width = 0; width < pooling_size; ++width) {
int in_width_idx = in_width_start + width;
in_width_idx = select(in_channel_offset + in_width_idx,
-1,
(in_width_idx < 0 || in_width_idx >= in_width));
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
res = res + in;
}
}
const int block_size = calculate_avg_block_size(pooling_size,
in_height_start, in_width_start,
in_height, in_width);
res /= block_size;
#else
DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
for (int height = 0; height < pooling_size; ++height) {
int in_height_idx = in_height_start + height;
in_height_idx = select(batch_idx + in_height_idx,
-1,
(in_height_idx < 0 || in_height_idx >= in_height));
if (in_height_idx != -1) {
for (int width = 0; width < pooling_size; ++width) {
int in_width_idx = in_width_start + width;
in_width_idx = select(in_channel_offset + in_width_idx,
-1,
(in_width_idx < 0 || in_width_idx >= in_width));
if (in_width_idx != -1) {
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
res = fmax(res, in);
}
}
}
VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567;
VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23;
*output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size;
}
#endif
WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
}
......@@ -38,7 +38,6 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
......
......@@ -10,131 +10,94 @@
namespace mace {
namespace kernels {
static void Pooling3(const Tensor *input,
const int *stride,
const PoolingType type,
Tensor *output) {
if (type != MAX) {
MACE_NOT_IMPLEMENTED;
}
static void Pooling(const Tensor *input,
const int *stride,
const int *paddings,
const int pooling_size,
const PoolingType type,
const DataType dt,
Tensor *output) {
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blk = (channels + 3) / 4;
const index_t pixel_width = (out_width + 3) / 4 ;
index_t channel_blocks = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(pixel_width * out_height),
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
};
auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : "");
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options);
if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
}
if (type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
const uint32_t lws[3] = {1, 8, 128};
uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, paddings[0] / 2);
pooling_kernel.setArg(idx++, paddings[1] / 2);
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
MACE_CHECK(error == CL_SUCCESS) << error;
}
static void PoolingN(const Tensor *input,
const int *stride,
const int *paddings,
const int pooling_size,
const PoolingType type,
Tensor *output) {
if (type != AVG) {
MACE_NOT_IMPLEMENTED;
}
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
index_t channel_blk = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(out_height * out_width),
template<typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input->dim(3), input->dim(3)
};
auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options);
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_,
output_shape.data(), paddings.data());
const uint32_t lws[3] = {1, 8, 128};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, paddings[0]);
pooling_kernel.setArg(idx++, paddings[1]);
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
DataTypeToEnum<T>::value, output);
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor *output) {
int paddings[2];
std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#define POOLING_HELPER \
switch(kernels_[0]) { \
case 3: \
Pooling3(input, strides_, pooling_type_, output); \
break; \
default: \
PoolingN(input, strides_, paddings, kernels_[0], \
pooling_type_, output); \
break; \
}
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
input = &padded_input;
POOLING_HELPER
} else {
POOLING_HELPER
}
#undef POOLING_HELPER
}
template
struct PoolingFunctor<DeviceType::OPENCL, float>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -18,36 +18,66 @@ enum PoolingType {
namespace kernels {
template <DeviceType D, typename T>
struct PoolingFunctor {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
struct PoolingFunctorBase {
PoolingFunctorBase(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: pooling_type_(pooling_type),
kernels_(kernels),
strides_(strides),
padding_(padding),
dilations_(dilations) {}
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
};
template<DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_,
output_shape.data(), paddings.data());
output_tensor->Resize(output_shape);
Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t height = output_shape[1];
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t out_image_size = height * width;
index_t input_channels = input_shape[1];
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
index_t input_height = input_shape[1];
index_t input_width = input_shape[2];
index_t input_channels = input_shape[3];
index_t in_image_size = input_height * input_width;
int kernel_h = kernels_[0];
......@@ -59,11 +89,6 @@ struct PoolingFunctor {
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// The left-upper most offset of the padded input
int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2;
......@@ -71,25 +96,24 @@ struct PoolingFunctor {
if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) {
index_t out_offset = (b * channels + c) * out_image_size;
index_t in_offset = (b * input_channels + c) * in_image_size;
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
T max = std::numeric_limits<T>::lowest();
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw;
max = std::max(max, input[input_offset]);
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
res = std::max(res, input[input_offset]);
}
}
}
output[out_offset] = max;
out_offset += 1;
*output = res;
output++;
}
}
}
......@@ -97,11 +121,10 @@ struct PoolingFunctor {
} else if (pooling_type_ == AVG) {
#pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) {
index_t out_offset = (b * channels + c) * out_image_size;
index_t in_offset = (b * input_channels + c) * in_image_size;
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t in_offset = b * in_image_size * input_channels + c;
T sum = 0;
int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -110,14 +133,14 @@ struct PoolingFunctor {
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw;
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset];
block_size += 1;
}
}
}
output[out_offset] = sum / block_size;
out_offset += 1;
*output = sum / block_size;
output++;
}
}
}
......@@ -125,22 +148,26 @@ struct PoolingFunctor {
}
}
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
};
template <>
template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor);
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor);
template<typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
};
} // namespace kernels
} // namespace mace
......
......@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
// TODO(chenghui): is it kind of a hack?
filter_shape[0] = input->shape()[1];
filter_shape[1] = input->shape()[0];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_(input, output);
return true;
};
......
......@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run
net.RunOp();
// Check
auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31});
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph
......@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 1, 3, 3},
{0, 1, 2, 3, 4, 5, 6, 7, 8});
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {4, 5, 7, 8});
auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 4, 4},
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {10, 11, 14, 15});
auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 2, 9},
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
// Run
net.RunOp(DeviceType::NEON);
net.RunOp();
// Check
auto expected = CreateTensor<float>({1, 1, 1, 5}, {10, 12, 14, 16, 17});
auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
template <DeviceType D>
template<DeviceType D>
static void SimpleMaxPooling3S2() {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 1, 3, 9},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
"Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
// Run
net.RunOp(D);
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26});
auto expected = CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<CPU>();
}
TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<OPENCL>();
}
template <DeviceType D>
static void AlignedMaxPooling3S2(Padding padding) {
template<DeviceType D>
static void MaxPooling3S2(const std::vector<index_t> &input_shape,
const std::vector<int> strides,
Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
......@@ -198,22 +185,33 @@ static void AlignedMaxPooling3S2(Padding padding) {
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 64, 64});
// Run
net.RunOp(D);
net.AddRandomInput<D, float>("Input", input_shape);
// run on cpu
net.RunOp();
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu
net.RunOp();
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}
// TODO(chenghui) : there is a bug.
......@@ -223,152 +221,140 @@ static void AlignedMaxPooling3S2(Padding padding) {
//}
TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
AlignedMaxPooling3S2<OPENCL>(Padding::VALID);
AlignedMaxPooling3S2<OPENCL>(Padding::SAME);
MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
}
template <DeviceType D>
static void UnalignedMaxPooling3S2(Padding padding) {
TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
OpsTestNet net;
auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", padding)
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 113, 43, 47});
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run on cpu
// Run
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001);
}
// TODO(chenghui) : there is a bug.
//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) {
// UnalignedMaxPooling3S2<NEON>();
//}
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
UnalignedMaxPooling3S2<OPENCL>(Padding::VALID);
UnalignedMaxPooling3S2<OPENCL>(Padding::SAME);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
template <DeviceType D>
template<DeviceType D>
static void SimpleAvgPoolingTest() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
// Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5});
auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, NEONSimpleAvgPooling) {
SimpleAvgPoolingTest<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<OPENCL>();
}
template <DeviceType D>
static void AlignedAvgPoolingTest(Padding padding) {
template<DeviceType D>
static void AvgPoolingTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels,
const std::vector<int> &strides,
Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {4, 4})
.AddIntsArg("strides", {4, 4})
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 15, 15});
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
net.AddRandomInput<D, float>("Input", shape);
// Run on cpu
// run on cpu
net.RunOp();
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5);
}
TEST_F(PoolingOpTest, NEONAlignedAvgPooling) {
AlignedAvgPoolingTest<NEON>(Padding::VALID);
AlignedAvgPoolingTest<NEON>(Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AlignedAvgPoolingTest<OPENCL>(Padding::VALID);
AlignedAvgPoolingTest<OPENCL>(Padding::SAME);
}
template <DeviceType D>
static void UnAlignedAvgPoolingTest(Padding padding) {
// Construct graph
OpsTestNet net;
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {7, 7})
.AddIntsArg("strides", {7, 7})
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 31, 37});
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
// Run on cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5);
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AvgPoolingTest<OPENCL>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
AvgPoolingTest<OPENCL>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
}
TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) {
UnAlignedAvgPoolingTest<NEON>(Padding::VALID);
UnAlignedAvgPoolingTest<NEON>(Padding::SAME);
TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID);
AvgPoolingTest<OPENCL>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
UnAlignedAvgPoolingTest<OPENCL>(Padding::VALID);
UnAlignedAvgPoolingTest<OPENCL>(Padding::SAME);
AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID);
AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID);
AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册