未验证 提交 fbce6342 编写于 作者: X xiebaiyuan 提交者: GitHub

[LITE][OPENCL][Image] fix global avg pool issue ,test=develop (#3028)

上级 f6c4370d
......@@ -15,17 +15,17 @@ limitations under the License. */
#include <cl_common.h>
__kernel void pool_max(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left) {
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
......@@ -37,18 +37,19 @@ __kernel void pool_max(__read_only image2d_t input,
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h,0);
start_h = max(start_h, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w,0);
start_w = max(start_w, 0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE);
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
CL_DTYPE4 tmp = READ_IMG_TYPE(
CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
max_value = max(max_value, tmp);
}
}
......@@ -58,17 +59,17 @@ __kernel void pool_max(__read_only image2d_t input,
}
__kernel void pool_avg(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left) {
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
......@@ -90,10 +91,121 @@ __kernel void pool_avg(__read_only image2d_t input,
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
sum += READ_IMG_TYPE(
CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
}
}
CL_DTYPE4 avg = sum / (ksize_h * ksize_w);
const int pos_out_x = mad24(out_c, out_width, out_w);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg);
}
__kernel void pool_avg_global(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1); // =1
const int out_nh = get_global_id(2); // = n*1
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// do not use dtype4 here
// skip issue for half 2048
float4 sum = (float4)(0.0f);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
for (int y = 0; y < in_height; ++y) {
for (int x = 0; x < in_width; ++x) {
half4 tmp = READ_IMG_TYPE(
CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
sum.x = convert_float(tmp.x) + sum.x;
sum.y = convert_float(tmp.y) + sum.y;
sum.z = convert_float(tmp.z) + sum.z;
sum.w = convert_float(tmp.w) + sum.w;
}
}
const float global_size_div = 1.0f / (in_height * in_width);
half4 avg;
avg.x = convert_half((sum.x * global_size_div));
avg.y = convert_half((sum.y * global_size_div));
avg.z = convert_half((sum.z * global_size_div));
avg.w = convert_half((sum.w * global_size_div));
#ifdef DEBUG
if (out_c == 0) {
printf("\033[31msum.x= %f \033 \n ", sum.x);
printf("sum.y=%f \n ", sum.y);
printf("sum.z=%f \n ", sum.z);
printf("sum.w=%f \n ", sum.w);
printf("one4.x=%f \n ", convert_float(one4.x));
printf("in_height=%d \n ", in_height);
printf("in_width=%d \n ", in_width);
printf("ksize_h=%d \n ", ksize_h);
printf("ksize_w=%d \n ", ksize_w);
printf("stride_h=%d \n ", stride_h);
printf("stride_w=%d \n ", stride_w);
printf("pad_top=%d \n ", pad_top);
printf("pad_left=%d \n ", pad_left);
printf("out_width=%d \n ", out_width);
printf("out_height=%d \n ", out_height);
printf("i++=%d \n ", i++);
printf("avg.x=%f \n ", convert_float(avg.x));
printf("avg.y=%f \n ", convert_float(avg.y));
printf("avg.z=%f \n ", convert_float(avg.z));
printf("avg.w=%f \n ", convert_float(avg.w));
}
#endif
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), avg);
}
__kernel void pool_max_global(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1); // =1
const int out_nh = get_global_id(2); // = n*1
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
for (int y = 0; y < in_height; ++y) {
for (int x = 0; x < in_width; ++x) {
max_value = max(max_value,
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_x + x, pos_in_y + y)));
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), max_value);
}
\ No newline at end of file
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
......@@ -37,7 +38,12 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
kernel_func_name_ += param.pooling_type;
const bool global_pooling = param.global_pooling;
if (global_pooling) {
kernel_func_name_ += "_global";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/pool_kernel.cl", build_options_);
......@@ -52,6 +58,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
std::vector<int> paddings = *param.paddings;
std::vector<int> strides = param.strides;
std::vector<int> ksize = param.ksize;
VLOG(4) << "global_pooling: " << global_pooling;
VLOG(4) << "pooling_type: " << pooling_type;
VLOG(4) << "paddings : " << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3] << " ";
if (global_pooling) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0;
......@@ -59,6 +69,18 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
ksize[i] = static_cast<int>(in_dims[i + 2]);
}
}
VLOG(4) << "in_dims : [" << in_dims.size() << "]" << in_dims[0] << " "
<< in_dims[1] << " " << in_dims[2] << " " << in_dims[3];
VLOG(4) << "out_dims : [" << out_dims.size() << "]" << out_dims[0] << " "
<< out_dims[1] << " " << out_dims[2] << " " << out_dims[3];
VLOG(4) << "paddings fixed : " << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3] << " ";
VLOG(4) << "strides : [" << strides.size() << "]" << strides[0] << " "
<< strides[1];
VLOG(4) << "ksize : [" << ksize.size() << "]" << ksize[0] << " "
<< ksize[1] << " " << ksize[2] << " " << ksize[3];
VLOG(4) << "paddings : [" << paddings.size() << "]" << paddings[0] << " "
<< paddings[1] << " " << paddings[2] << " " << paddings[3];
bool pads_equal =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
if (!pads_equal) {
......@@ -86,7 +108,8 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
int w = out_dims[3];
int nh = out_dims[0] * out_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
VLOG(4) << "global_work_size : [" << 3 << "]" << c_block << " " << w
<< " " << nh << " ";
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册