提交 c899d438 编写于 作者: L liuqi

Finish max/avg pooling opencl kernel.

上级 e19514a9
...@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
filter_shape[0] = input_tensor->shape()[1];
filter_shape[1] = input_tensor->shape()[1];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, output_shape.data(),
paddings.data());
output_tensor->Resize(output_shape);
const float *input = input_tensor->data<float>(); const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>(); float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
Tensor padded_input; Tensor padded_input;
ConstructInputWithPadding(input_tensor, paddings, &padded_input); ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input);
input = padded_input.data<float>(); input = padded_input.data<float>();
input_shape = padded_input.shape().data(); input_shape = padded_input.shape().data();
#endif #endif
...@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 2x2, strides: 2x2 // kernel_size: 2x2, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2 if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape); PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} else { // AVG_POOL_2x2s2x2 } else { // AVG_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape); PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape, PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} }
} else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 && } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
...@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 3x3, strides: 2x2 // kernel_size: 3x3, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2 if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape); PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} else { // AVG_POOL_3x3s2x2 } else { // AVG_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape); PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape, PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} }
} else { // not implement yet } else { // not implement yet
......
...@@ -18,4 +18,7 @@ ...@@ -18,4 +18,7 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
...@@ -57,15 +57,14 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -57,15 +57,14 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
// Unrolling this loop hurt perfmance // Unrolling this loop hurt perfmance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_hb_value = height_idx + hb_idx; int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx, in_hb_value = select(in_hb_value + batch_idx,
-1, -1,
(in_hb_value < 0 || in_hb_value >= in_height)); (in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value; int in_width_value;
#define READ_INPUT(i) \ #define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \ in_width_value = in_width##i + width_idx; \
......
...@@ -59,15 +59,13 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -59,15 +59,13 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
// Unrolling this loop hurt perfmance // Unrolling this loop hurt perfmance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < 3; ++width_idx) { for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width; in_idx = in_ch_blk * in_width;
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
int in_width_value; int in_width_value;
#define READ_INPUT(i) \ #define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \ in_width_value = in_width##i + width_idx; \
......
#include <common.h> #include <common.h>
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) { #ifdef FP16
VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr); #define MIN_VALUE -HALF_MAX
VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
DATA_TYPE row01 = *(input_ptr + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
DATA_TYPE row11 = *(input_ptr + in_width + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
return res;
}
// Supported data type: half/float
__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__global DATA_TYPE *output) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
VEC_DATA_TYPE(DATA_TYPE,4) res;
#ifdef STRIDE_1
res = vec_pooling_3_s1(input_ptr, in_width);
#else #else
res = vec_pooling_3_s2(input_ptr, in_width); #define MIN_VALUE -FLT_MAX
#endif #endif
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = inner_pooling_3(input_ptr, in_width);
input_ptr += stride;
}
}
}
}
int calculate_avg_block_size(const int pos_h, inline int calculate_avg_block_size(const int pool_size,
const int pos_w, const int pos_h,
const int pool_size, const int pos_w,
const int pad_h, const int h_size,
const int pad_w, const int w_size) {
const int h_size, const int h_start = max(0, pos_h);
const int w_size) { const int w_start = max(0, pos_w);
const int h_start = max(0, pos_h - pad_h); const int h_end = min(pos_h + pool_size, h_size);
const int w_start = max(0, pos_w - pad_w); const int w_end = min(pos_w + pool_size, w_size);
const int h_end = min(pos_h + pool_size - pad_h, h_size);
const int w_end = min(pos_w + pool_size - pad_w, w_size);
return (h_end - h_start) * (w_end - w_start); return (h_end - h_start) * (w_end - w_start);
} }
// Supported data type: half/float // Supported data type: half/float
__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */ __kernel void pooling(__read_only image2d_t input,
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
__private const int out_chan_num, __private const int out_height,
__private const int out_height, __private const int pad_top,
__private const int out_width, __private const int pad_left,
__private const int stride, __private const int stride,
__private const int pad_h, __private const int pooling_size,
__private const int pad_w, __write_only image2d_t output) {
__private const int pooling_size, const int out_chan_idx = get_global_id(0);
__global DATA_TYPE *output) { const int out_width_idx = get_global_id(1);
int batch = get_global_id(0); const int out_width = get_global_size(1);
int out_chan_idx = get_global_id(1); const int out_hb_idx = get_global_id(2);
int out_pixel_idx = get_global_id(2);
const int batch_idx = (out_hb_idx / out_height) * in_height;
const int out_pixel_height = out_pixel_idx / out_width; const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
const int out_pixel_width = out_pixel_idx % out_width; const int in_width_start = out_width_idx * stride - pad_left;
const int in_channel_offset = out_chan_idx * in_width;
const int out_chan_begin = out_chan_idx * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int in_pixel_idx = out_pixel_height * stride * in_width #ifdef POOL_AVG
+ out_pixel_width * stride; DATA_TYPE4 res = 0;
for (int height = 0; height < pooling_size; ++height) {
const int in_pixel = in_height * in_width; int in_height_idx = in_height_start + height;
const int out_pixel = out_height * out_width; in_height_idx = select(batch_idx + in_height_idx,
-1,
const int in_offset = batch * out_chan_num * in_pixel; (in_height_idx < 0 || in_height_idx >= in_height));
const int out_offset = batch * out_chan_num * out_pixel; for (int width = 0; width < pooling_size; ++width) {
const DATA_TYPE *input_base = input + in_offset + in_pixel_idx; int in_width_idx = in_width_start + width;
DATA_TYPE *output_base = output + out_offset + out_pixel_idx; in_width_idx = select(in_channel_offset + in_width_idx,
-1,
const int block_size = calculate_avg_block_size( (in_width_idx < 0 || in_width_idx >= in_width));
out_pixel_height * stride,
out_pixel_width * stride, DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
pooling_size, res = res + in;
pad_h/2, }
pad_w/2, }
in_height - pad_h, const int block_size = calculate_avg_block_size(pooling_size,
in_width - pad_w); in_height_start, in_width_start,
for (int i = out_chan_begin; i < out_chan_end; ++i) { in_height, in_width);
VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f; res /= block_size;
DATA_TYPE sum1 = 0.0f; #else
DATA_TYPE *output_ptr = output_base + i * out_pixel; DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
for (int y = 0; y < pooling_size; ++y) { for (int height = 0; height < pooling_size; ++height) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width; int in_height_idx = in_height_start + height;
int x = 0; in_height_idx = select(batch_idx + in_height_idx,
for (; x < (pooling_size-8); x += 8) { -1,
VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr); (in_height_idx < 0 || in_height_idx >= in_height));
sum8 += data; if (in_height_idx != -1) {
input_ptr += 8; for (int width = 0; width < pooling_size; ++width) {
} int in_width_idx = in_width_start + width;
for (; x < pooling_size; ++x) { in_width_idx = select(in_channel_offset + in_width_idx,
sum1 += *input_ptr; -1,
input_ptr++; (in_width_idx < 0 || in_width_idx >= in_width));
if (in_width_idx != -1) {
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
res = fmax(res, in);
}
} }
} }
VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567;
VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23;
*output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size;
} }
#endif
WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
} }
...@@ -38,7 +38,6 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -38,7 +38,6 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
auto program = runtime->program(); auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0; uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
......
...@@ -10,131 +10,94 @@ ...@@ -10,131 +10,94 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
static void Pooling3(const Tensor *input, static void Pooling(const Tensor *input,
const int *stride, const int *stride,
const PoolingType type, const int *paddings,
Tensor *output) { const int pooling_size,
if (type != MAX) { const PoolingType type,
MACE_NOT_IMPLEMENTED; const DataType dt,
} Tensor *output) {
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t out_height = output->dim(1);
index_t out_height = output->dim(2); index_t out_width = output->dim(2);
index_t out_width = output->dim(3); index_t channels = output->dim(3);
index_t channel_blk = (channels + 3) / 4; index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_width = (out_width + 3) / 4 ;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(pixel_width * out_height), static_cast<uint32_t>(batch * out_height),
}; };
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
}
if (type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
const uint32_t lws[3] = {1, 8, 128}; uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t idx = 0; uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2))); pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height)); pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width)); pooling_kernel.setArg(idx++, paddings[0] / 2);
pooling_kernel.setArg(idx++, paddings[1] / 2);
pooling_kernel.setArg(idx++, stride[0]); pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange, pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << error;
} }
static void PoolingN(const Tensor *input, template<typename T>
const int *stride, void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *paddings, Tensor *output) {
const int pooling_size, MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet";
const PoolingType type, std::vector<index_t> output_shape(4);
Tensor *output) { std::vector<int> paddings(2);
if (type != AVG) { std::vector<index_t> filter_shape = {
MACE_NOT_IMPLEMENTED; kernels_[0], kernels_[1],
} input->dim(3), input->dim(3)
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
index_t channel_blk = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(out_height * out_width),
}; };
auto runtime = OpenCLRuntime::Get(); kernels::CalcNHWCPaddingAndOutputSize(
std::set<std::string> built_options; input->shape().data(), filter_shape.data(),
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); dilations_, strides_, this->padding_,
auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); output_shape.data(), paddings.data());
const uint32_t lws[3] = {1, 8, 128}; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); DataTypeToEnum<T>::value, output);
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, paddings[0]);
pooling_kernel.setArg(idx++, paddings[1]);
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor *output) {
int paddings[2];
std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#define POOLING_HELPER \
switch(kernels_[0]) { \
case 3: \
Pooling3(input, strides_, pooling_type_, output); \
break; \
default: \
PoolingN(input, strides_, paddings, kernels_[0], \
pooling_type_, output); \
break; \
}
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
input = &padded_input;
POOLING_HELPER
} else {
POOLING_HELPER
}
#undef POOLING_HELPER
} }
template
struct PoolingFunctor<DeviceType::OPENCL, float>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -18,36 +18,66 @@ enum PoolingType { ...@@ -18,36 +18,66 @@ enum PoolingType {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> struct PoolingFunctorBase {
struct PoolingFunctor { PoolingFunctorBase(const PoolingType pooling_type,
PoolingFunctor(const PoolingType pooling_type, const int *kernels,
const int *kernels, const int *strides,
const int *strides, const Padding padding,
const Padding padding, const int *dilations)
const int *dilations)
: pooling_type_(pooling_type), : pooling_type_(pooling_type),
kernels_(kernels), kernels_(kernels),
strides_(strides), strides_(strides),
padding_(padding), padding_(padding),
dilations_(dilations) {} dilations_(dilations) {}
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
};
template<DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_,
output_shape.data(), paddings.data());
output_tensor->Resize(output_shape);
Tensor::MappingGuard in_guard(input_tensor); Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor); Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>(); const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>(); T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
index_t batch = output_shape[0]; index_t batch = output_shape[0];
index_t channels = output_shape[1]; index_t height = output_shape[1];
index_t height = output_shape[2]; index_t width = output_shape[2];
index_t width = output_shape[3]; index_t channels = output_shape[3];
index_t out_image_size = height * width; index_t out_image_size = height * width;
index_t input_channels = input_shape[1]; index_t input_height = input_shape[1];
index_t input_height = input_shape[2]; index_t input_width = input_shape[2];
index_t input_width = input_shape[3]; index_t input_channels = input_shape[3];
index_t in_image_size = input_height * input_width; index_t in_image_size = input_height * input_width;
int kernel_h = kernels_[0]; int kernel_h = kernels_[0];
...@@ -59,11 +89,6 @@ struct PoolingFunctor { ...@@ -59,11 +89,6 @@ struct PoolingFunctor {
int dilation_h = dilations_[0]; int dilation_h = dilations_[0];
int dilation_w = dilations_[1]; int dilation_w = dilations_[1];
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int padded_h_start = 0 - paddings[0] / 2; int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2; int padded_w_start = 0 - paddings[1] / 2;
...@@ -71,25 +96,24 @@ struct PoolingFunctor { ...@@ -71,25 +96,24 @@ struct PoolingFunctor {
if (pooling_type_ == MAX) { if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) { T res = std::numeric_limits<T>::lowest();
T max = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh; int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
max = std::max(max, input[input_offset]); res = std::max(res, input[input_offset]);
} }
} }
} }
output[out_offset] = max; *output = res;
out_offset += 1; output++;
} }
} }
} }
...@@ -97,11 +121,10 @@ struct PoolingFunctor { ...@@ -97,11 +121,10 @@ struct PoolingFunctor {
} else if (pooling_type_ == AVG) { } else if (pooling_type_ == AVG) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) {
T sum = 0; T sum = 0;
int block_size = 0; int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
...@@ -110,14 +133,14 @@ struct PoolingFunctor { ...@@ -110,14 +133,14 @@ struct PoolingFunctor {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset]; sum += input[input_offset];
block_size += 1; block_size += 1;
} }
} }
} }
output[out_offset] = sum / block_size; *output = sum / block_size;
out_offset += 1; output++;
} }
} }
} }
...@@ -125,22 +148,26 @@ struct PoolingFunctor { ...@@ -125,22 +148,26 @@ struct PoolingFunctor {
} }
} }
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
}; };
template <> template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()( void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor); Tensor *output_tensor);
template <> template<typename T>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()( struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Tensor *input_tensor, PoolingFunctor(const PoolingType pooling_type,
Tensor *output_tensor); const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> { ...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
// TODO(chenghui): is it kind of a hack?
filter_shape[0] = input->shape()[1];
filter_shape[1] = input->shape()[0];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_(input, output); functor_(input, output);
return true; return true;
}; };
......
...@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) { ...@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4}, "Input", {1, 4, 4, 2},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_SAME) { TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph // Construct graph
...@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { ...@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 1, 3, 3}, net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8}); {0, 1, 2, 3, 4, 5, 6, 7, 8});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {4, 5, 7, 8}); auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { ...@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 4, 4}, "Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {10, 11, 14, 15}); auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ...@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 2, 9}, "Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
// Run // Run
net.RunOp(DeviceType::NEON); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 5}, {10, 12, 14, 16, 17}); auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
template<DeviceType D>
template <DeviceType D>
static void SimpleMaxPooling3S2() { static void SimpleMaxPooling3S2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 1, 3, 9}, "Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
// Run
net.RunOp(D); if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
}
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26}); auto expected = CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() { ...@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<CPU>(); SimpleMaxPooling3S2<CPU>();
} }
TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<OPENCL>(); SimpleMaxPooling3S2<OPENCL>();
} }
template <DeviceType D> template<DeviceType D>
static void AlignedMaxPooling3S2(Padding padding) { static void MaxPooling3S2(const std::vector<index_t> &input_shape,
const std::vector<int> strides,
Padding padding) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
...@@ -198,22 +185,33 @@ static void AlignedMaxPooling3S2(Padding padding) { ...@@ -198,22 +185,33 @@ static void AlignedMaxPooling3S2(Padding padding) {
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3}) .AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 64, 64}); net.AddRandomInput<D, float>("Input", input_shape);
// Run
net.RunOp(D); // run on cpu
net.RunOp();
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on cpu BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
net.RunOp(); OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
} }
// TODO(chenghui) : there is a bug. // TODO(chenghui) : there is a bug.
...@@ -223,152 +221,140 @@ static void AlignedMaxPooling3S2(Padding padding) { ...@@ -223,152 +221,140 @@ static void AlignedMaxPooling3S2(Padding padding) {
//} //}
TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
AlignedMaxPooling3S2<OPENCL>(Padding::VALID); MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
AlignedMaxPooling3S2<OPENCL>(Padding::SAME); MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
} }
template <DeviceType D> TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
static void UnalignedMaxPooling3S2(Padding padding) { MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph // Construct graph
OpsTestNet net; auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", padding) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 113, 43, 47}); net.AddInputFromArray<DeviceType::CPU, float>(
// Run "Input", {1, 4, 4, 2},
net.RunOp(D); {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
Tensor expected; 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
expected.Copy(*net.GetOutput("Output"));
// Run on cpu // Run
net.RunOp(); net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001); // Check
} auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
// TODO(chenghui) : there is a bug.
//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) {
// UnalignedMaxPooling3S2<NEON>();
//}
TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
UnalignedMaxPooling3S2<OPENCL>(Padding::VALID);
UnalignedMaxPooling3S2<OPENCL>(Padding::SAME);
} }
template <DeviceType D> template<DeviceType D>
static void SimpleAvgPoolingTest() { static void SimpleAvgPoolingTest() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("InputImage")
.Output("Output") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5}); auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
TEST_F(PoolingOpTest, NEONSimpleAvgPooling) {
SimpleAvgPoolingTest<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<OPENCL>(); SimpleAvgPoolingTest<OPENCL>();
} }
template <DeviceType D> template<DeviceType D>
static void AlignedAvgPoolingTest(Padding padding) { static void AvgPoolingTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels,
const std::vector<int> &strides,
Padding padding) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {4, 4}) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", {4, 4}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 15, 15}); net.AddRandomInput<D, float>("Input", shape);
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu // run on cpu
net.RunOp(); net.RunOp();
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
}
TEST_F(PoolingOpTest, NEONAlignedAvgPooling) {
AlignedAvgPoolingTest<NEON>(Padding::VALID);
AlignedAvgPoolingTest<NEON>(Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AlignedAvgPoolingTest<OPENCL>(Padding::VALID);
AlignedAvgPoolingTest<OPENCL>(Padding::SAME);
}
template <DeviceType D>
static void UnAlignedAvgPoolingTest(Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("InputImage")
.Output("Output") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {7, 7}) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", {7, 7}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 31, 37});
// Run
net.RunOp(D); net.RunOp(D);
Tensor expected; ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
expected.Copy(*net.GetOutput("Output"));
// Run on cpu ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
net.RunOp(); }
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5); TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AvgPoolingTest<OPENCL>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
AvgPoolingTest<OPENCL>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
} }
TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
UnAlignedAvgPoolingTest<NEON>(Padding::VALID); AvgPoolingTest<OPENCL>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID);
UnAlignedAvgPoolingTest<NEON>(Padding::SAME); AvgPoolingTest<OPENCL>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
UnAlignedAvgPoolingTest<OPENCL>(Padding::VALID); AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID);
UnAlignedAvgPoolingTest<OPENCL>(Padding::SAME); AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID);
AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册