Commit 8e442ce7 authored by mindspore-ci-bot, committed by Gitee

!5635 [MS][LITE][GPU] fix bug in matmul and pooling

Merge pull request !5635 from chenzupeng/master-lite
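This merge reworks the OpenCL pooling kernels (average and max, in both buffer and image variants), MatMulOpenCLKernel::Init, and the host-side pooling wrapper. Three themes run through the diff: the kernel sources drop hard-coded float4 in favor of precision-neutral FLT macros so a single source compiles for fp32 and fp16; the pooling window origin is corrected from X * stride + padding to X * stride - padding; and fp16 becomes a runtime decision (enable_fp16_, plus a Float16 kernel registration) rather than a compile-time #ifdef.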
-__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                                const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
   // axis to dst tensor coordinate
   int X = get_global_id(0);
@@ -10,10 +13,10 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
     return;
   }
-  float4 r = (float4)(0.0f);
-  float window_size = 0.0f;
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 r = (FLT4)(0.0f);
+  FLT window_size = 0.0f;
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
   for (int kx = 0; kx < kernel_size.x; ++kx) {
     int x_c = xs + kx;
@@ -21,11 +24,11 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
     for (int ky = 0; ky < kernel_size.y; ++ky) {
       int y_c = ys + ky;
       bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
-      r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (float4)(0.0f);
+      r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f);
       window_size += !outside ? 1.0f : 0.0f;
     }
   }
-  float4 result = convert_float4(r / window_size);
+  FLT4 result = TO_FLT4(r / window_size);
   output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
 }
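FLT, FLT4, TO_FLT4, READ_IMAGE, and WRITE_IMAGE are not OpenCL built-ins: the rewritten kernel assumes the runtime prepends a precision prelude before compiling the source. A minimal sketch of such a prelude, assuming a hypothetical ENABLE_FP16_BUILD switch (the macro names are the ones the diff uses; the actual injection mechanism in MindSpore Lite is not shown here):

    // Hypothetical prelude; one branch is selected per build.
    #ifdef ENABLE_FP16_BUILD
    #define FLT half
    #define FLT4 half4
    #define TO_FLT4 convert_half4
    #define READ_IMAGE read_imageh
    #define WRITE_IMAGE write_imageh
    #else
    #define FLT float
    #define FLT4 float4
    #define TO_FLT4 convert_float4
    #define READ_IMAGE read_imagef
    #define WRITE_IMAGE write_imagef
    #endif

With the fp32 branch these macros reproduce the old code exactly (float4, convert_float4, read_imagef), which is why the behavioral change in this file reduces to the padding fix.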
@@ -43,10 +46,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
     return;
   }
-  float4 r = (float4)(0.0f);
-  float window_size = 0.0f;
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 r = (FLT4)(0.0f);
+  FLT window_size = 0.0f;
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
     int y_c = ys + ky;
@@ -54,10 +57,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
     for (int kx = 0; kx < kernel_size.x; ++kx) {
       int x_c = xs + kx;
       bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
-      r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
+      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f);
       window_size += !outside ? 1.0f : 0.0f;
     }
   }
-  float4 result = convert_float4(r / window_size);
-  write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
+  FLT4 result = TO_FLT4(r / window_size);
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
 }
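The sign flip is the pooling bug named in the PR title: padding extends the input with a border, so the window for output index X must start at X * stride - padding; with the old + padding, every window was shifted into the interior by twice the pad and border outputs averaged the wrong region. The image variant also gains an explicit !outside select, so the zero contribution of out-of-bounds taps no longer depends on the sampler's border behavior. A small self-contained check of the corrected origin formula (plain C, hypothetical sizes):

    #include <stdio.h>

    /* With input width 4, kernel 3, stride 1, pad 1, the four outputs should
       see effective window sizes 2, 3, 3, 2: edge windows shrink instead of
       sliding into the interior. */
    int main(void) {
      const int stride = 1, padding = 1, kernel = 3, input = 4;
      for (int out = 0; out < 4; ++out) {
        int xs = out * stride - padding;        /* fixed sign: minus padding */
        int window = 0;
        for (int k = 0; k < kernel; ++k) {
          int x = xs + k;
          if (x >= 0 && x < input) ++window;    /* same masking as the kernel's outside test */
        }
        printf("out=%d start=%d window=%d\n", out, xs, window);
      }
      return 0;
    }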
-__kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                                const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
   // axis to dst tensor coordinate
   int X = get_global_id(0);
@@ -10,9 +13,9 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
     return;
   }
-  float4 maximum = (float4)(-10000.0f);
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
   for (int kx = 0; kx < kernel_size.x; ++kx) {
     int x_c = xs + kx;
@@ -24,7 +27,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
       if (y_c < 0 || y_c >= input_shape.y) {
        continue;
       }
-      float4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
+      FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
       maximum = max(src, maximum);
     }
   }
@@ -45,18 +48,18 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
     return;
   }
-  float4 maximum = (float4)(-10000.0f);
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
     int y_c = ys + ky;
     if (y_c < 0 || y_c >= input_shape.y) continue;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
       int x_c = xs + kx;
       if (x_c < 0 || x_c >= input_shape.x) continue;
-      float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
+      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
       maximum = max(src, maximum);
     }
   }
-  write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }

@@ -58,14 +58,13 @@ int MatMulOpenCLKernel::Init() {
   sizeCO = {co, UP_DIV(co, C4NUM)};
   PadWeight();
   in_ori_format_ = in_tensors_[0]->GetFormat();
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
   out_ori_format_ = out_tensors_[0]->GetFormat();
-  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
   if (out_tensors_[0]->shape().size() == 2) {
-    out_ori_format_ = schema::Format_NC;
     out_tensors_[0]->SetFormat(schema::Format_NC4);
-    in_ori_format_ = schema::Format_NC;
     in_tensors_[0]->SetFormat(schema::Format_NC4);
+  } else {
+    in_tensors_[0]->SetFormat(schema::Format_NHWC4);
+    out_tensors_[0]->SetFormat(schema::Format_NHWC4);
   }
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
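The old Init had two problems: it forced NHWC4 on both tensors even when the 2-D branch was about to repack them as NC4, and that branch then reassigned in_ori_format_/out_ori_format_ to a hardcoded Format_NC, discarding the caller's recorded layout that the runtime needs when converting the output back. A minimal self-contained illustration of why the original format must be recorded once and never overwritten (hypothetical C++; the NHWC caller stands in for any layout that differs from the hardcoded assumption):

    #include <cassert>

    enum Format { NC, NC4, NHWC, NHWC4 };

    int main() {
      Format tensor_fmt = NHWC;   // caller's actual layout
      Format ori = tensor_fmt;    // record the original before mutating
      tensor_fmt = NC4;           // switch to the GPU-internal packed layout
      // The old branch effectively did `ori = NC;` at this point, so a caller
      // using NHWC would be converted back to the wrong layout afterwards.
      assert(ori == NHWC);
      return 0;
    }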

@@ -60,7 +60,8 @@ int PoolingOpenCLKernel::Init() {
     return RET_INVALID_OP_NAME;
   }
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  enable_fp16_ = ocl_runtime->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
@@ -96,11 +96,10 @@ int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
   size_t im_dst_x, im_dst_y;
   im_dst_x = out_tensors_[0]->Width() * CO4;
   im_dst_y = out_tensors_[0]->Height();
-#ifdef ENABLE_FP16
-  size_t img_dtype = CL_HALF_FLOAT;
-#else
   size_t img_dtype = CL_FLOAT;
-#endif
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
   img_size->clear();
   std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
   *img_size = vec;
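Replacing the #ifdef with a check of enable_fp16_ (queried from OpenCLRuntime in Init above) lets a single binary pick the image channel type per device. A hedged sketch of how a caller might consume the reported {width, height, channel type} triple, assuming an existing cl::Context, the OpenCL C++ bindings, and a CL_RGBA order matching the C4 packing (an assumption, not something this diff shows):

    std::vector<size_t> img_size;
    kernel->GetImageSize(0, &img_size);  // {im_dst_x, im_dst_y, CL_FLOAT or CL_HALF_FLOAT}
    cl::ImageFormat fmt(CL_RGBA, static_cast<cl_channel_type>(img_size[2]));
    cl::Image2D image(context, CL_MEM_READ_WRITE, fmt, img_size[0], img_size[1]);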
@@ -161,5 +160,6 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor:
 }
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
 }  // namespace kernel
 }  // namespace mindspore

@@ -44,6 +44,7 @@ class PoolingOpenCLKernel : public OpenCLKernel {
   std::vector<size_t> InitGlobalSize() const;
   PoolingParameter *parameter_;
   cl::Kernel kernel_;
+  bool enable_fp16_{false};
 };
 }  // namespace mindspore::kernel

@@ -50,6 +50,7 @@ class OpenCLKernel : public LiteKernel {
   }
   OpenCLMemType GetMemType() { return out_mem_type_; }
   void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }
+  void SetFormatType(schema::Format format_type) { op_format_ = format_type; }
   schema::Format GetInOriFormat() { return in_ori_format_; }
   schema::Format GetOutOriFormat() { return out_ori_format_; }
@@ -57,6 +58,7 @@ class OpenCLKernel : public LiteKernel {
   OpenCLMemType out_mem_type_{OpenCLMemType::IMG};
   schema::Format in_ori_format_{schema::Format_NHWC};
   schema::Format out_ori_format_{schema::Format_NHWC4};
+  schema::Format op_format_{schema::Format_NC4HW4};
 };
 }  // namespace mindspore::kernel
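SetFormatType and op_format_ are new plumbing with no caller in this diff; presumably a scheduler or layout pass uses the setter to choose each kernel's internal format, with NC4HW4 as the default. A self-contained mock of the setter's behavior (hypothetical; the real class carries far more state):

    #include <cassert>

    enum Format { NC4HW4, NHWC4 };

    struct MockOpenCLKernel {
      Format op_format_{NC4HW4};                        // default, as in the diff
      void SetFormatType(Format f) { op_format_ = f; }  // new setter
    };

    int main() {
      MockOpenCLKernel k;
      k.SetFormatType(NHWC4);
      assert(k.op_format_ == NHWC4);
      return 0;
    }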