提交 8e442ce7 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!5635 [MS][LITE][GPU]fix bug in matmul and pooling

Merge pull request !5635 from chenzupeng/master-lite
__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(0);
......@@ -10,10 +13,10 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
return;
}
float4 r = (float4)(0.0f);
float window_size = 0.0f;
int xs = X * stride.x + padding.x;
int ys = Y * stride.y + padding.y;
FLT4 r = (FLT4)(0.0f);
FLT window_size = 0.0f;
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
......@@ -21,11 +24,11 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (float4)(0.0f);
r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
float4 result = convert_float4(r / window_size);
FLT4 result = TO_FLT4(r / window_size);
output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
}
......@@ -43,10 +46,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
return;
}
float4 r = (float4)(0.0f);
float window_size = 0.0f;
int xs = X * stride.x + padding.x;
int ys = Y * stride.y + padding.y;
FLT4 r = (FLT4)(0.0f);
FLT window_size = 0.0f;
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
......@@ -54,10 +57,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
float4 result = convert_float4(r / window_size);
write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
FLT4 result = TO_FLT4(r / window_size);
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
}
__kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(0);
......@@ -10,9 +13,9 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
return;
}
float4 maximum = (float4)(-10000.0f);
int xs = X * stride.x + padding.x;
int ys = Y * stride.y + padding.y;
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
......@@ -24,7 +27,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
if (y_c < 0 || y_c >= input_shape.y) {
continue;
}
float4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
maximum = max(src, maximum);
}
}
......@@ -45,18 +48,18 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
return;
}
float4 maximum = (float4)(-10000.0f);
int xs = X * stride.x + padding.x;
int ys = Y * stride.y + padding.y;
FLT4 maximum = (FLT4)(-10000.0f);
int xs = X * stride.x - padding.x;
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
maximum = max(src, maximum);
}
}
write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
}
......@@ -58,14 +58,13 @@ int MatMulOpenCLKernel::Init() {
sizeCO = {co, UP_DIV(co, C4NUM)};
PadWeight();
in_ori_format_ = in_tensors_[0]->GetFormat();
in_tensors_[0]->SetFormat(schema::Format_NHWC4);
out_ori_format_ = out_tensors_[0]->GetFormat();
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
if (out_tensors_[0]->shape().size() == 2) {
out_ori_format_ = schema::Format_NC;
out_tensors_[0]->SetFormat(schema::Format_NC4);
in_ori_format_ = schema::Format_NC;
in_tensors_[0]->SetFormat(schema::Format_NC4);
} else {
in_tensors_[0]->SetFormat(schema::Format_NHWC4);
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
......
......@@ -60,7 +60,7 @@ int PoolingOpenCLKernel::Init() {
return RET_INVALID_OP_NAME;
}
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
enable_fp16_ = ocl_runtime->GetFp16Enable();
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
#else
......@@ -96,11 +96,10 @@ int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
size_t im_dst_x, im_dst_y;
im_dst_x = out_tensors_[0]->Width() * CO4;
im_dst_y = out_tensors_[0]->Height();
#ifdef ENABLE_FP16
size_t img_dtype = CL_HALF_FLOAT;
#else
size_t img_dtype = CL_FLOAT;
#endif
if (enable_fp16_) {
img_dtype = CL_HALF_FLOAT;
}
img_size->clear();
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
*img_size = vec;
......@@ -161,5 +160,6 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor:
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
} // namespace kernel
} // namespace mindspore
......@@ -44,6 +44,7 @@ class PoolingOpenCLKernel : public OpenCLKernel {
std::vector<size_t> InitGlobalSize() const;
PoolingParameter *parameter_;
cl::Kernel kernel_;
bool enable_fp16_{false};
};
} // namespace mindspore::kernel
......
......@@ -50,6 +50,7 @@ class OpenCLKernel : public LiteKernel {
}
OpenCLMemType GetMemType() { return out_mem_type_; }
void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }
void SetFormatType(schema::Format format_type) { op_format_ = format_type; }
schema::Format GetInOriFormat() { return in_ori_format_; }
schema::Format GetOutOriFormat() { return out_ori_format_; }
......@@ -57,6 +58,7 @@ class OpenCLKernel : public LiteKernel {
OpenCLMemType out_mem_type_{OpenCLMemType::IMG};
schema::Format in_ori_format_{schema::Format_NHWC};
schema::Format out_ori_format_{schema::Format_NHWC4};
schema::Format op_format_{schema::Format_NC4HW4};
};
} // namespace mindspore::kernel
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册