提交 63ac1f35 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!4266 [MS][LITE] opencl bug fix for pooling

Merge pull request !4266 from chenzhongming/lite
__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(0);
int Y = get_global_id(1);
......@@ -32,8 +32,7 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(0);
int Y = get_global_id(1);
......@@ -49,17 +48,16 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
int xs = X * stride.x + padding.x;
int ys = Y * stride.y + padding.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside_x = x_c < 0 || x_c >= input_shape.x;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
r += read_imagef(input, smp_zero, (int2)(x_c, y_c * input_shape.w + Z));
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
window_size += !outside ? 1.0f : 0.0f;
}
}
float4 result = convert_float4(r / window_size);
write_imagef(output, (int2)(X, Y * output_shape.w + Z), result);
write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
}
......@@ -31,7 +31,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum;
}
__constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
......@@ -48,20 +48,15 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
float4 maximum = (float4)(-10000.0f);
int xs = X * stride.x + padding.x;
int ys = Y * stride.y + padding.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) {
continue;
}
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) {
continue;
}
float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z));
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
maximum = max(src, maximum);
}
}
write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum);
write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
}
......@@ -82,7 +82,7 @@ int PoolingOpenCLKernel::Init() {
std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
const size_t global_x = outputs_[0]->Height();
const size_t global_y = outputs_[0]->Width();
const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
const size_t global_z = UP_DIV(outputs_[0]->Channel(), C4NUM);
std::vector<size_t> global = {global_x, global_y, global_z};
return global;
}
......@@ -90,13 +90,8 @@ std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
size_t im_dst_x, im_dst_y;
if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
im_dst_x = outputs_[0]->Height();
im_dst_y = outputs_[0]->Width() * CO4;
} else {
im_dst_y = outputs_[0]->Width();
im_dst_x = outputs_[0]->Height() * CO4;
}
im_dst_x = outputs_[0]->Width() * CO4;
im_dst_y = outputs_[0]->Height();
#ifdef ENABLE_FP16
size_t img_dtype = CL_HALF_FLOAT;
#else
......@@ -117,7 +112,7 @@ int PoolingOpenCLKernel::Run() {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
// attribute
int slices = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
int slices = UP_DIV(outputs_[0]->Channel(), C4NUM);
cl_int4 input_shape = {inputs_[0]->Height(), inputs_[0]->Width(), inputs_[0]->Channel(), slices};
cl_int4 output_shape = {outputs_[0]->Height(), outputs_[0]->Width(), outputs_[0]->Channel(), slices};
cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册