提交 69f4b8b1 编写于 作者: 刘琦

Merge branch 'pooling-opencl' into 'master'

Pooling OpenCL kernel (currently supports only the ICNet pooling ops)

See merge request !99
......@@ -207,7 +207,11 @@ class Tensor {
os.str("");
os.clear();
MappingGuard guard(this);
for (int i = 0; i < size_; ++i) {
if ( i != 0 && i % shape_[3] == 0) {
os << "\n";
}
CASES(dtype_, (os << this->data<T>()[i]) << ", ");
}
LOG(INFO) << os.str();
......
......@@ -61,9 +61,11 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
// based on the model accuracy.
padding_size[0] =
(output_height - 1) * strides[0] + k_extent_height - input_shape[2];
std::max<int>(0, (output_height - 1) * strides[0]
+ k_extent_height - input_shape[2]);
padding_size[1] =
(output_width - 1) * strides[1] + k_extent_width - input_shape[3];
std::max<int>(0, (output_width - 1) * strides[1]
+ k_extent_width - input_shape[3]);
output_shape[0] = input_shape[0];
output_shape[1] = output_channels;
......@@ -110,15 +112,21 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size[0] =
(output_height - 1) * strides[0] + k_extent_height - input_shape[2];
std::max<int>(0, (output_height - 1) * strides[0]
+ k_extent_height - input_shape[2]);
padding_size[1] =
(output_width - 1) * strides[1] + k_extent_width - input_shape[3];
std::max<int>(0, (output_width - 1) * strides[1]
+ k_extent_width - input_shape[3]);
}
void ConstructInputWithPadding(const float *input,
const index_t *input_shape,
void ConstructInputWithPadding(const Tensor *input_tensor,
const int *paddings,
Tensor *output_tensor) {
Tensor *output_tensor,
bool padding_same_value) {
Tensor::MappingGuard input_mapper(input_tensor);
const float *input = input_tensor->data<float>();
const index_t *input_shape = input_tensor->shape().data();
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -133,21 +141,51 @@ void ConstructInputWithPadding(const float *input,
output_tensor->Resize(output_shape);
Tensor::MappingGuard padded_input_mapper(output_tensor);
Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_ptr = output_tensor->mutable_data<float>();
memset(output_ptr, 0, output_tensor->size() * sizeof(float));
// Skip the padded top rows
output_ptr += padded_top * output_width;
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < height; ++k) {
memcpy(output_ptr + padded_left, input, width * sizeof(float));
if (padding_same_value) {
#define COPY_INPUT \
std::fill(output_ptr, output_ptr+padded_left, input[0]); \
output_ptr += padded_left; \
memcpy(output_ptr, input, width * sizeof(float)); \
output_ptr += width; \
std::fill(output_ptr , output_ptr + padded_right, input[width-1]); \
output_ptr += padded_right;
const int padded_bottom = paddings[0] - padded_top;
const int padded_right = paddings[1] - padded_left;
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < padded_top; ++k) {
COPY_INPUT;
}
for (int k = 0; k < height; ++k) {
COPY_INPUT;
input += width;
}
input -= width;
for (int k = 0; k < padded_bottom; ++k) {
COPY_INPUT;
}
input += width;
output_ptr += output_width;
}
// Skip the padded bottom in this channel and top in the next channel
output_ptr += paddings[0] * output_width;
}
#undef COPY_INPUT
} else {
output_ptr += padded_top * output_width;
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < height; ++k) {
memcpy(output_ptr + padded_left, input, width * sizeof(float));
input += width;
output_ptr += output_width;
}
// Skip the padded bottom in this channel and top in the next channel
output_ptr += paddings[0] * output_width;
}
}
}
}
......
......@@ -32,10 +32,10 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
Padding padding,
int *padding_size);
void ConstructInputWithPadding(const float *input,
const index_t *input_shape,
void ConstructInputWithPadding(const Tensor *input,
const int *paddings,
Tensor *output_tensor);
Tensor *output_tensor,
bool padding_same_value = false);
} // namespace kernels
} // namespace mace
......
......@@ -75,6 +75,12 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
return;
}
Tensor padded_input;
// Keep this alive during kernel execution
if (paddings_[0] > 0 || paddings_[1] > 0) {
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
input = &padded_input;
}
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
......@@ -86,14 +92,6 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
auto output_data = output->mutable_data<float>();
auto output_shape = output->shape().data();
// Keep this alive during kernel execution
Tensor padded_input;
if (paddings_[0] > 0 || paddings_[1] > 0) {
ConstructInputWithPadding(input_data, input->shape().data(),
paddings_.data(), &padded_input);
input_data = padded_input.data<float>();
input_shape = padded_input.shape().data();
}
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
bias_data, output_data, output_shape);
......
......@@ -67,8 +67,7 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
// Keep this alive during kernel execution
Tensor padded_input;
if (paddings_[0] > 0 || paddings_[1] > 0) {
ConstructInputWithPadding(input_ptr, input_shape, paddings_.data(),
&padded_input);
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
input_ptr = padded_input.data<float>();
input_shape = padded_input.shape().data();
}
......
......@@ -3,8 +3,6 @@
//
#include "mace/kernels/pooling.h"
#include <arm_neon.h>
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
......@@ -57,13 +55,22 @@ extern void PoolingAvgNeonK3x3S2x2Padded(const float *input,
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input,
const index_t *input_shape,
float *output,
const index_t *output_shape) {
const Tensor *input_tensor,
Tensor *output_tensor) {
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#ifdef __COPY_MAKE_PADDING
Tensor padded_input;
ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
ConstructInputWithPadding(input_tensor, paddings, &padded_input);
input = padded_input.data<float>();
input_shape = padded_input.shape().data();
#endif
......@@ -76,14 +83,14 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
#else
PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape,
paddings_);
paddings);
#endif
} else { // AVG_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
#else
PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape,
paddings_);
paddings);
#endif
}
} else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
......@@ -94,20 +101,20 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
#else
PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape,
paddings_);
paddings);
#endif
} else { // AVG_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
#else
PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape,
paddings_);
paddings);
#endif
}
} else { // not implement yet
PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
paddings_, dilations_)(
input, input_shape, output, output_shape);
padding_, dilations_)(
input_tensor, output_tensor);
}
}
......
// Computes 4 horizontally adjacent outputs of a 3x3 MAX pooling window with
// stride 1. Reads 6 consecutive floats (4 + 2) from each of 3 rows starting
// at input_ptr (rows are in_width floats apart), rearranges them with vector
// swizzles into the four overlapping 3-wide windows, and reduces with fmax.
// Assumes all 6x3 loaded elements are in bounds (input pre-padded on host).
float4 vec_pooling_3_s1(const float *input_ptr, const int in_width) {
// Row 0: columns 0..3 and 4..5.
float4 row00 = vload4(0, input_ptr);
float2 row01 = vload2(0, input_ptr + 4);
// Row 1.
float4 row10 = vload4(0, input_ptr + in_width);
float2 row11 = vload2(0, input_ptr + in_width + 4);
// Row 2.
float4 row20 = vload4(0, input_ptr + in_width * 2);
float2 row21 = vload2(0, input_ptr + in_width * 2 + 4);
// Swizzle each row into the overlapping window layout shared by all rows.
float8 data00 = (float8)(row00.s01212323);
float4 data01 = (float4)(row01.s0, row00.s3, row01.s01);
float8 data10 = (float8)(row10.s01212323);
float4 data11 = (float4)(row11.s0, row10.s3, row11.s01);
float8 data20 = (float8)(row20.s01212323);
float4 data21 = (float4)(row21.s0, row20.s3, row21.s01);
// Vertical reduction across the 3 rows.
float8 left = fmax(fmax(data00, data10), data20);
float4 right = fmax(fmax(data01, data11), data21);
// Horizontal reduction: gather the 3 candidates of each of the 4 windows.
float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2));
res = fmax(res, (float4)(left.s25, right.s03));
return res;
}
// Computes 4 outputs of a 3x3 MAX pooling window with stride 2. Reads 9
// consecutive floats (8 + 1) from each of 3 rows starting at input_ptr
// (rows are in_width floats apart); the four 3-wide windows start at
// columns 0, 2, 4 and 6. Same swizzle-then-fmax scheme as the stride-1
// variant. Assumes all 9x3 loaded elements are in bounds (pre-padded input).
float4 vec_pooling_3_s2(const float *input_ptr, const int in_width) {
// Row 0: columns 0..7 plus the 9th scalar.
float8 row00 = vload8(0, input_ptr);
float row01 = *(input_ptr + 8);
// Row 1.
float8 row10 = vload8(0, input_ptr + in_width);
float row11 = *(input_ptr + in_width + 8);
// Row 2.
float8 row20 = vload8(0, input_ptr + in_width * 2);
float row21 = *(input_ptr + in_width * 2 + 8);
// Swizzle each row into the stride-2 window layout.
float8 data00 = (float8)(row00.s01223445);
float4 data01 = (float4)(row00.s667, row01);
float8 data10 = (float8)(row10.s01223445);
float4 data11 = (float4)(row10.s667, row11);
float8 data20 = (float8)(row20.s01223445);
float4 data21 = (float4)(row20.s667, row21);
// Vertical reduction across the 3 rows.
float8 left = fmax(fmax(data00, data10), data20);
float4 right = fmax(fmax(data01, data11), data21);
// Horizontal reduction into the 4 window maxima.
float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2));
res = fmax(res, (float4)(left.s25, right.s03));
return res;
}
// Scalar fallback producing one output of a 3x3 MAX pooling window: returns
// the maximum of the 3x3 neighborhood starting at input_ptr, where rows are
// in_width floats apart. Used when fewer than 4 output pixels remain.
float inner_pooling_3(const float *input_ptr, const int in_width) {
  float best = input_ptr[0];
  for (int r = 0; r < 3; ++r) {
    const float *row = input_ptr + r * in_width;
    for (int c = 0; c < 3; ++c) {
      best = fmax(best, row[c]);
    }
  }
  return best;
}
// 3x3 MAX pooling kernel (stride 1 or 2) over NCHW input.
// NDRange mapping: dim 0 = batch, dim 1 = blocks of 4 output channels,
// dim 2 = (output row, block of 4 horizontal output pixels) packed together.
// Takes no padding arguments: the host pads the input beforehand
// (ConstructInputWithPadding), so reads start at stride-aligned offsets.
__kernel void pooling3(__global const float *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__global float *output) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
// Unpack dim 2 into (output row, 4-pixel block within the row).
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
// Channel range covered by this work item (clamped at the last block).
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
// Output pixel range, clamped to the end of the current row.
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
// Top-left input offset of the first window for this work item.
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const float *input_ptr = input_base + i * in_pixel;
float *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
// Fast path: 4 outputs at once via the vectorized helpers.
float4 res;
if (stride == 1) {
res = vec_pooling_3_s1(input_ptr, in_width);
} else {
res = vec_pooling_3_s2(input_ptr, in_width);
}
vstore4(res, 0, output_ptr);
} else {
// Row tail: fewer than 4 outputs left, compute them one by one.
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = inner_pooling_3(input_ptr, in_width);
input_ptr += stride;
}
}
}
}
// Returns the number of valid (non-padding) input elements inside the square
// pooling window anchored at (pos_h, pos_w): the window
// [pos - pad, pos - pad + pool_size) is clamped to [0, size) in each
// dimension and the clamped area is returned. Used as the divisor for
// average pooling so padded positions do not dilute the mean.
int calculate_avg_block_size(const int pos_h,
                             const int pos_w,
                             const int pool_size,
                             const int pad_h,
                             const int pad_w,
                             const int h_size,
                             const int w_size) {
  int top = pos_h - pad_h;
  if (top < 0) top = 0;
  int left = pos_w - pad_w;
  if (left < 0) left = 0;
  int bottom = pos_h + pool_size - pad_h;
  if (bottom > h_size) bottom = h_size;
  int right = pos_w + pool_size - pad_w;
  if (right > w_size) right = w_size;
  return (bottom - top) * (right - left);
}
// Generic AVG pooling kernel for a square pooling_size x pooling_size window
// over NCHW input. Each work item computes ONE output pixel for up to 4
// consecutive channels (dim 0 = batch, dim 1 = channel blocks of 4,
// dim 2 = flat output pixel index).
// The input is pre-padded on the host; pad_h/pad_w carry the TOTAL padding
// and are used only to recover the valid (unpadded) window area for the
// divisor, so padding zeros do not dilute the average.
__kernel void poolingn(__global const float *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__private const int pad_h,
__private const int pad_w,
__private const int pooling_size,
__global float *output) {
int batch = get_global_id(0);
int out_chan_idx = get_global_id(1);
int out_pixel_idx = get_global_id(2);
const int out_pixel_height = out_pixel_idx / out_width;
const int out_pixel_width = out_pixel_idx % out_width;
const int out_chan_begin = out_chan_idx * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
// Top-left corner of the pooling window in the (padded) input.
const int in_pixel_idx = out_pixel_height * stride * in_width
+ out_pixel_width * stride;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_idx;
float *output_base = output + out_offset + out_pixel_idx;
// in_height/in_width are padded dims; subtracting the total padding yields
// the original extent, and pad/2 is the leading (top/left) padding.
const int block_size = calculate_avg_block_size(
out_pixel_height * stride,
out_pixel_width * stride,
pooling_size,
pad_h/2,
pad_w/2,
in_height - pad_h,
in_width - pad_w);
for (int i = out_chan_begin; i < out_chan_end; ++i) {
// Accumulate the window sum: 8-wide vector chunks plus a scalar tail.
float8 sum8 = 0.0f;
float sum1 = 0.0f;
float *output_ptr = output_base + i * out_pixel;
for (int y = 0; y < pooling_size; ++y) {
const float *input_ptr = input_base + i * in_pixel + y * in_width;
int x = 0;
// NOTE(review): `x < (pooling_size-8)` (strict) sends an exact final
// 8-element chunk down the scalar tail instead of the vector path;
// results are still correct, just slightly slower -- confirm intent.
for (; x < (pooling_size-8); x += 8) {
float8 data = vload8(0, input_ptr);
sum8 += data;
input_ptr += 8;
}
for (; x < pooling_size; ++x) {
sum1 += *input_ptr;
input_ptr++;
}
}
// Horizontal reduction of the vector accumulator, then divide by the
// count of valid elements in this window.
float4 sum4 = sum8.s0123 + sum8.s4567;
float2 sum2 = sum4.s01 + sum4.s23;
*output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size;
}
}
......@@ -47,9 +47,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
if (paddings_[0] > 0 || paddings_[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
Tensor::MappingGuard input_mapper(input);
ConstructInputWithPadding(input->data<float>(), input->shape().data(), paddings_.data(),
&padded_input);
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
}else {
conv2d_func(input, filter, bias, output);
......
......@@ -45,9 +45,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
if (paddings_[0] > 0 || paddings_[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
Tensor::MappingGuard input_mapper(input);
ConstructInputWithPadding(input->data<float>(), input->shape().data(), paddings_.data(),
&padded_input);
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
}else {
conv2d_func(input, filter, bias, output);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/pooling.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
namespace kernels {
// Host-side launcher for the "pooling3" OpenCL kernel: the vectorized 3x3
// pooling fast path. Only MAX pooling is implemented here; the kernel takes
// no padding arguments, so `input` must already be padded by the caller
// (see PoolingFunctor<OPENCL>::operator()).
static void Pooling3(const Tensor *input,
const int *stride,
const PoolingType type,
Tensor *output) {
if (type != MAX) {
MACE_NOT_IMPLEMENTED;
}
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
// One work item per 4 channels (rounded up) ...
index_t channel_blk = (channels + 3) / 4;
// ... and per 4 horizontal output pixels (rounded up).
const index_t pixel_width = (out_width + 3) / 4 ;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(pixel_width * out_height),
};
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto max_pooling_kernel = cl::Kernel(program, "pooling3");
// NOTE(review): hard-coded local size; 1*8*128 = 1024 work items may exceed
// the device's work-group limit, and OpenCL 1.x requires each global dim to
// be a multiple of the matching local dim -- confirm on target devices.
const uint32_t lws[3] = {1, 8, 128};
uint32_t idx = 0;
// Argument order must match the pooling3 kernel signature exactly.
max_pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
max_pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
max_pooling_kernel.setArg(idx++, stride[0]);
max_pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
max_pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS);
}
// Host-side launcher for the generic "poolingn" OpenCL kernel. Only AVG
// pooling is implemented here. `input` is the (already padded) tensor, and
// `paddings` carries the total padding so the kernel can compute the valid
// window area used as the averaging divisor.
static void PoolingN(const Tensor *input,
const int *stride,
const int *paddings,
const int pooling_size,
const PoolingType type,
Tensor *output) {
if (type != AVG) {
MACE_NOT_IMPLEMENTED;
}
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
// One work item per 4 channels (rounded up) and per output pixel.
index_t channel_blk = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(out_height * out_width),
};
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto pooling_kernel = cl::Kernel(program, "poolingn");
// NOTE(review): same hard-coded local size caveats as Pooling3 -- confirm
// the work-group limit and gws/lws divisibility on target devices.
const uint32_t lws[3] = {1, 8, 128};
uint32_t idx = 0;
// Argument order must match the poolingn kernel signature exactly.
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, paddings[0]);
pooling_kernel.setArg(idx++, paddings[1]);
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS);
}
// OpenCL pooling entry point: computes the padding for the configured
// Padding mode, pads the input on the host when needed, then dispatches on
// kernel size: 3x3 -> Pooling3 (MAX-only fast path), otherwise -> PoolingN
// (AVG-only).
// NOTE(review): a 3x3 AVG pool is therefore routed into the MAX-only
// Pooling3 and hits MACE_NOT_IMPLEMENTED -- confirm this is intended.
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor *output) {
int paddings[2];
// Fake NCHW filter shape built from the pooling kernel size, as expected
// by the shared CalPaddingSize helper.
std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// Dispatch-on-kernel-size helper, expanded in both branches below.
#define POOLING_HELPER \
switch(kernels_[0]) { \
case 3: \
Pooling3(input, strides_, pooling_type_, output); \
break; \
default: \
PoolingN(input, strides_, paddings, kernels_[0], \
pooling_type_, output); \
break; \
}
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
// For MAX pooling, pad with replicated edge values (padding_same_value)
// so padding never wins the max; AVG keeps zero padding and corrects the
// divisor inside the kernel instead.
ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
input = &padded_input;
POOLING_HELPER
} else {
POOLING_HELPER
}
#undef POOLING_HELPER
}
} // namespace kernels
} // namespace mace
......@@ -7,6 +7,7 @@
#include <limits>
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
......@@ -22,18 +23,22 @@ struct PoolingFunctor {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const int *paddings,
const Padding padding,
const int *dilations)
: pooling_type_(pooling_type),
kernels_(kernels),
strides_(strides),
paddings_(paddings),
padding_(padding),
dilations_(dilations) {}
void operator()(const T *input,
const index_t *input_shape,
T *output,
const index_t *output_shape) {
void operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
......@@ -54,9 +59,14 @@ struct PoolingFunctor {
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// The left-upper most offset of the padded input
int padded_h_start = 0 - paddings_[0] / 2;
int padded_w_start = 0 - paddings_[1] / 2;
int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2;
if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(2)
......@@ -93,6 +103,7 @@ struct PoolingFunctor {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
T sum = 0;
int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
......@@ -101,10 +112,11 @@ struct PoolingFunctor {
inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw;
sum += input[input_offset];
block_size += 1;
}
}
}
output[out_offset] = sum / (kernel_h * kernel_w);
output[out_offset] = sum / block_size;
out_offset += 1;
}
}
......@@ -116,16 +128,19 @@ struct PoolingFunctor {
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const int *paddings_;
const Padding padding_;
const int *dilations_;
};
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input,
const index_t *input_shape,
float *output,
const index_t *output_shape);
const Tensor *input_tensor,
Tensor *output_tensor);
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor);
} // namespace kernels
} // namespace mace
......
......@@ -12,4 +12,5 @@ REGISTER_CPU_OPERATOR(Pooling, PoolingOp<DeviceType::CPU, float>);
REGISTER_NEON_OPERATOR(Pooling, PoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Pooling, PoolingOp<DeviceType::OPENCL, float>);
} // namespace mace
......@@ -19,7 +19,9 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
kernels_(OperatorBase::GetRepeatedArgument<int>("kernels")),
pooling_type_(
static_cast<PoolingType>(OperatorBase::GetSingleArgument<int>(
"pooling_type", static_cast<int>(AVG)))){};
"pooling_type", static_cast<int>(AVG)))),
functor_(pooling_type_, kernels_.data(), this->strides_.data(),
this->padding_, this->dilations_.data()){};
bool Run() override {
const Tensor *input = this->Input(INPUT);
......@@ -40,17 +42,14 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
paddings.data());
output->Resize(output_shape);
auto pooling_func = kernels::PoolingFunctor<D, T>(
pooling_type_, kernels_.data(), this->strides_.data(), paddings.data(),
this->dilations_.data());
pooling_func(input->data<float>(), input->shape().data(),
output->mutable_data<float>(), output->shape().data());
functor_(input, output);
return true;
};
protected:
std::vector<int> kernels_;
PoolingType pooling_type_;
kernels::PoolingFunctor<D, T> functor_;
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
......
......@@ -150,9 +150,11 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
template <DeviceType D>
static void SimpleMaxPooling3S2() {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -164,12 +166,12 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 1, 3, 9},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
// Run
net.RunOp(DeviceType::NEON);
net.RunOp(D);
// Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26});
......@@ -177,9 +179,95 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, AVG_k2x2s2x2) {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<CPU>();
}
TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<OPENCL>();
}
template <DeviceType D>
static void AlignedMaxPooling3S2(Padding padding) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 64, 64});
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001);
}
// TODO(chenghui) : there is a bug.
//TEST_F(PoolingOpTest, NEONAlignedMaxPooling3S2) {
// AlignedMaxPooling3S2<NEON>(Padding::VALID);
// AlignedMaxPooling3S2<NEON>(Padding::SAME);
//}
TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
AlignedMaxPooling3S2<OPENCL>(Padding::VALID);
AlignedMaxPooling3S2<OPENCL>(Padding::SAME);
}
template <DeviceType D>
static void UnalignedMaxPooling3S2(Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 113, 43, 47});
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001);
}
// TODO(chenghui) : there is a bug.
//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) {
// UnalignedMaxPooling3S2<NEON>();
//}
TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
UnalignedMaxPooling3S2<OPENCL>(Padding::VALID);
UnalignedMaxPooling3S2<OPENCL>(Padding::SAME);
}
template <DeviceType D>
static void SimpleAvgPoolingTest() {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -191,14 +279,96 @@ TEST_F(PoolingOpTest, AVG_k2x2s2x2) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run
net.RunOp(DeviceType::NEON);
net.RunOp(D);
// Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, NEONSimpleAvgPooling) {
SimpleAvgPoolingTest<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<OPENCL>();
}
template <DeviceType D>
static void AlignedAvgPoolingTest(Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {4, 4})
.AddIntsArg("strides", {4, 4})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 15, 15});
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5);
}
TEST_F(PoolingOpTest, NEONAlignedAvgPooling) {
AlignedAvgPoolingTest<NEON>(Padding::VALID);
AlignedAvgPoolingTest<NEON>(Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AlignedAvgPoolingTest<OPENCL>(Padding::VALID);
AlignedAvgPoolingTest<OPENCL>(Padding::SAME);
}
template <DeviceType D>
static void UnAlignedAvgPoolingTest(Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {7, 7})
.AddIntsArg("strides", {7, 7})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 31, 37});
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5);
}
TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) {
UnAlignedAvgPoolingTest<NEON>(Padding::VALID);
UnAlignedAvgPoolingTest<NEON>(Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
UnAlignedAvgPoolingTest<OPENCL>(Padding::VALID);
UnAlignedAvgPoolingTest<OPENCL>(Padding::SAME);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册