diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
index a6e1c322e12f014ea8b679f722f8ec06085b6b84..17afaae4a95a4c14ca2d8c0e872d147f7ce8357b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
@@ -1,189 +1,186 @@
-__constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                         float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                         float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
                                          int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        // FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
-        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(x_c, (Z * src_size.y + y_c)));
+        r += TO_FLT4(src_p * flt_p);
       }
       fx_c++;
    }
  }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  // dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
-  write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0);
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res);
 }
 __kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                        float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                        float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
                                         int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z + x_c * src_size.z, y_c));
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
+        r += TO_FLT4(src_p * flt_p);
      }
      fx_c++;
    }
  }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-  write_imagef(dst_data, (int2)(X * dst_size.z + Z, Y), res0);
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), res);
 }
 __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                            float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                            float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
                                             int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z;
   {
-    int y_c = y_offseted;
+    int y_c = y_offset;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     {
-      int x_c = x_offseted;
+      int x_c = x_offset;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        // FLT4 src_p =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
+        r += TO_FLT4(src_p * flt_p);
      }
    }
  }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-  write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0);
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
+  WRITE_IMAGE(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res);
 }
 __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                         float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                         float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
                                          int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
      bool outside_x = x_c < 0 || x_c >= src_size.x;
      if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
+        r += TO_FLT4(src_p * flt_p);
      }
      fx_c++;
    }
  }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res;
 }
 __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                        float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                        float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
                                         int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
      bool outside_x = x_c < 0 || x_c >= src_size.x;
      if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        r += TO_FLT4(src_p * flt_p);
      }
      fx_c++;
    }
  }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
 }
 __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                            float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                            float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
                                             int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z;
   {
-    int y_c = y_offseted;
+    int y_c = y_offset;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     {
-      int x_c = x_offseted;
+      int x_c = x_offset;
      bool outside_x = x_c < 0 || x_c >= src_size.x;
      if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        r += TO_FLT4(src_p * flt_p);
      }
    }
  }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
index fecdcb488743b369167d3b4c4364c91719c656ed..bc08a62c2b2f01b9d826ebf28e37ac1e2c6461b7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -1,15 +1,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                          int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                           int4 shape) {
   int X = get_global_id(0);
@@ -47,58 +37,17 @@ __kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only
   }
   WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(X * size.y + Y) * size.z + Z]);
 }
-__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                            int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                           int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                           int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                            int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
+  // size(h, w, c4, 1), shape(n, c, h, w)
+  int X = get_global_id(0);  // h
+  int Y = get_global_id(1);  // w
+  int Z = get_global_id(2);  // c4
   if (X >= size.x || Y >= size.y || Z >= size.z) {
     return;
   }
-  // FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]);
+  WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), src_data[(Z * size.x + X) * size.y + Y]);
 }
-
 __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                          int4 shape) {
   int X = get_global_id(0);
@@ -109,56 +58,6 @@ __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __globa
   }
   dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z));
 }
-__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                         int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                          int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                           int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                         int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                         int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                           int4 shape) {
   int X = get_global_id(0);
@@ -185,25 +84,16 @@ __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __glob
      }
    }
  }
-__kernel void to_format_NC4HW4_to_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
+  // size(h, w, c, 1), shape(n, c, h, w)
+  int X = get_global_id(0);  // h
+  int Y = get_global_id(1);  // w
+  int Z = get_global_id(2);  // c
   if (X >= size.x || Y >= size.y || Z >= size.z) {
     return;
   }
-  dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
+  dst_data[(Z * size.x + X) * size.y + Y] = READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X));
 }
 __kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                            int4 shape) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index c3a9f681bc9315df3af1759a5f05cac58796f7ff..06e833c692e67b4656a2fd5a3d631a6803c81ca9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -20,9 +20,10 @@
 #include <utility>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/arm/fp32/convolution_depthwise.h"
+#include "src/runtime/kernel/opencl/utils.h"
+#include "nnacl/fp32/common_func.h"
+#include "nnacl/op_base.h"
 #include "include/errorcode.h"
-#include "nnacl/pack.h"
 
 #ifndef PROGRAM_WITH_IL
@@ -81,30 +82,50 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() {
   auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto allocator = ocl_runtime->GetAllocator();
+  bool is_fp16 = ocl_runtime->GetFp16Enable();
   // weight: o, h, w, i; o == group, i == 1
-  auto origin_weight = reinterpret_cast<FLOAT_t *>(in_tensors_.at(kWeightIndex)->Data());
+  void *origin_weight = in_tensors_.at(kWeightIndex)->Data();
   int CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
   int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;
-  packed_weight_ = reinterpret_cast<FLOAT_t *>(allocator->Malloc(pack_weight_size * sizeof(FLOAT_t)));
-  packed_weight_ = reinterpret_cast<FLOAT_t *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
   int plane = parameter->kernel_h_ * parameter->kernel_w_;
-#ifdef ENABLE_FP16
-  PackNCHWToNC4HW4Fp16(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel());
-#else
-  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel());
-#endif
+  if (is_fp16) {
+    packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(int16_t));
+    packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+    if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) {
+      std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
+      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+    } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
+      std::function<int16_t(float)> to_dtype = Float32ToShort;
+      PackNCHWToNC4HW4<float, int16_t>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+    } else {
+      MS_LOG(ERROR) << "Only support float16/float32, actual data type " << in_tensors_.at(kWeightIndex)->data_type();
+    }
+  } else {
+    packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(float));
+    packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+    if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
+      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+    } else {
+      MS_LOG(ERROR) << "Only support float16/float32, actual data type " << in_tensors_.at(kWeightIndex)->data_type();
+    }
+  }
   allocator->UnmapBuffer(packed_weight_);
 
   if (in_tensors_.size() == kInputSize2) {
-    bias_data_ = reinterpret_cast<FLOAT_t *>(allocator->Malloc(C4NUM * CO4 * sizeof(FLOAT_t)));
-    bias_data_ = reinterpret_cast<FLOAT_t *>(allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true));
-    size_t up_co_size = C4NUM * CO4 * sizeof(FLOAT_t);
+    size_t dtype_size = sizeof(float);
+    if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
+      dtype_size = sizeof(int16_t);
+    }
+    bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size);
+    bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true);
+    size_t up_co_size = C4NUM * CO4 * dtype_size;
     memset(bias_data_, 0, up_co_size);
-    auto ori_bias = reinterpret_cast<FLOAT_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * sizeof(FLOAT_t));
+    auto ori_bias = in_tensors_.at(kBiasIndex)->Data();
+    memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size);
     allocator->UnmapBuffer(bias_data_);
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
@@ -124,11 +145,10 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *i
     im_dst_y = out_tensors_[0]->Height() * CO4;
     im_dst_x = out_tensors_[0]->Width();
   }
-#ifdef ENABLE_FP16
-  size_t img_dtype = CL_HALF_FLOAT;
-#else
   size_t img_dtype = CL_FLOAT;
-#endif
+  if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
+    img_dtype = CL_HALF_FLOAT;
+  }
   img_size->clear();
   std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
   *img_size = vec;
@@ -204,5 +224,6 @@ kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector<lite::t
   return kernel;
 }
 
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
index 22c167b964b4809d3190b5b9cad2d833b0e3aea0..6564054483b6e480db6b28ebd3741cdf187688f7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
@@ -20,7 +20,6 @@
 #include <vector>
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/conv_parameter.h"
-
 #include "src/runtime/opencl/opencl_runtime.h"
 
 namespace mindspore::kernel {
@@ -46,8 +45,8 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
   int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
 
  private:
-  FLOAT_t *packed_weight_;
-  FLOAT_t *bias_data_;
+  void *packed_weight_;
+  void *bias_data_;
   cl::Kernel kernel_;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
index 7c4dd4546d9166298f9070c21ff327357755d72a..01f4572dbd7687d2d951925b61a92afd86e1960b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -172,5 +172,6 @@ kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::tensor::
   return kernel;
 }
 
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
index 4b19d71dc7bca37eedc3e17f8c20fa77d7222d91..bf269120f2edf6cb55a7c5091f52ff3818e6a390 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
@@ -93,11 +93,10 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *
     }
     out_tensors->emplace_back(new_tensor);
-#ifdef ENABLE_FP16
-    KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat};
-#else
     KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat};
-#endif
+    if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
+      desc.data_type = kNumberTypeFloat16;
+    }
     OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter;
     MS_ASSERT(parameter);
     if (parameter == nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h
index 07a87547bb9aef131ee5e9238c1d7cb1de9631b6..c498b65c3ff192c0a7773d101f6ecead0ce1fef0 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -23,6 +23,7 @@
 #include "utils/log_adapter.h"
 #include "nnacl/op_base.h"
 #include "src/lite_kernel.h"
+#include "src/common//utils.h"
 
 namespace mindspore::lite {
 kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors,
@@ -89,6 +90,73 @@ std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int ma
 
 std::string CLErrorCode(cl_int error_code);
 
+template <class T1, class T2>
+void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel,
+                      const std::function<T2(T1)> &to_dtype) {
+  int c4 = UP_DIV(channel, C4NUM);
+  for (int b = 0; b < batch; b++) {
+    int src_offset = b * plane * channel;
+    int dst_offset = b * plane * c4 * C4NUM;
+    for (int c = 0; c < channel; c++) {
+      int c4_block_num = c / C4NUM;
+      int c4_block_rem = c % C4NUM;
+      int src_c_offset = src_offset + c * plane;
+      int dst_c_offset = dst_offset + c4_block_num * plane * C4NUM;
+      for (int k = 0; k < plane; k++) {
+        int src_kernel_offset = src_c_offset + k;
+        int dst_kernel_offset = dst_c_offset + C4NUM * k + c4_block_rem;
+        (static_cast<T2 *>(dst) + dst_kernel_offset)[0] =
+          to_dtype((static_cast<T1 *>(src) + src_kernel_offset)[0]);
+      }
+    }
+  }
+}
+template <class T1, class T2>
+void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel,
+                     const std::function<T2(T1)> &to_dtype) {
+  int c4 = UP_DIV(channel, C4NUM);
+  int nhwc4_batch_unit_offset = c4 * C4NUM * plane;
+  int ic_remainder_ = channel % C4NUM;
+  if (ic_remainder_ != 0) {
+    int nhwc4_batch_offset = 0;
+    for (int b = 0; b < batch; b++) {
+      int batch_offset = b * channel * plane;
+      for (int i = 0; i < plane; ++i) {
+        for (int c = 0; c < channel; ++c) {
+          (static_cast<T2 *>(dst) + nhwc4_batch_offset + i * c4 * C4NUM + c)[0] =
+            to_dtype((static_cast<T1 *>(src) + batch_offset + i * channel + c)[0]);
+        }
+      }
+      nhwc4_batch_offset += nhwc4_batch_unit_offset;
+    }
+  } else {
+    size_t ori_input_size = batch * plane * channel;
+    for (size_t n = 0; n < ori_input_size; ++n) {
+      (static_cast<T2 *>(dst) + n)[0] = to_dtype((static_cast<T1 *>(src) + n)[0]);
+    }
+  }
+}
+template <class T1, class T2>
+void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel,
+                      const std::function<T2(T1)> &to_dtype) {
+  int c4 = UP_DIV(channel, C4NUM);
+  for (int b = 0; b < batch; b++) {
+    int src_oc_offset = b * plane * channel;
+    int dst_oc_offset = b * plane * c4 * C4NUM;
+    for (int k = 0; k < plane; k++) {
+      int src_kernel_offset = src_oc_offset + k * channel;
+      int dst_kernel_offset = dst_oc_offset + k * C4NUM;
+      for (int i = 0; i < channel; i++) {
+        int c4_block_num = i / C4NUM;
+        int c4_block_rem = i % C4NUM;
+        int src_ic_offset = src_kernel_offset + i;
+        int dst_ic_offset = dst_kernel_offset + c4_block_num * plane * C4NUM + c4_block_rem;
+        (static_cast<T2 *>(dst) + dst_ic_offset)[0] = to_dtype((static_cast<T1 *>(src) + src_ic_offset)[0]);
+      }
+    }
+  }
+}
+
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_UTILS_H_
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
index a88a23a5c25a30e160b1ac0896926b32d86538e9..3fb61f1d55c1269b69794d869fa42e8f31968583 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
@@ -19,20 +19,20 @@
 #include "common/common_test.h"
 #include "mindspore/lite/src/common/file_utils.h"
 #include "nnacl/pack.h"
+#include "src/runtime/kernel/opencl/utils.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h"
 
-bool IMAGE2D_OPEN = true;
-
 namespace mindspore {
 class TestConvolutionDwOpenCL : public mindspore::CommonTest {
  public:
   TestConvolutionDwOpenCL() {}
 };
 
-void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *weight_data, float_t *gnd_data,
-                       schema::Format format, bool is_compare = true) {
+template <class T1, class T2>
+void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_data, T2 *gnd_data, schema::Format format,
+                       TypeId dtype = kNumberTypeFloat32, bool is_compare = true, T2 err_max = 1e-5) {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->Init();
   auto allocator = ocl_runtime->GetAllocator();
@@ -40,45 +40,54 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
   // pack input
   int IC4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_;
-  auto packed_input = std::make_unique<float>(pack_input_size);
-  if (packed_input.get() == nullptr) {
+  auto packed_input = new (std::nothrow) T2[pack_input_size];
+  if (packed_input == nullptr) {
     return;
   }
-  memset(packed_input.get(), 0, pack_input_size * sizeof(float));
+  memset(packed_input, 0, pack_input_size * sizeof(T2));
   int plane = conv_param->input_w_ * conv_param->input_h_;
+  std::function<T2(T2)> to_dtype = [](T2 x) -> T2 { return x; };
   if (format == schema::Format_NHWC4) {
-    PackNHWCToNHWC4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_);
+    kernel::PackNHWCToNHWC4<T2, T2>(input_data, packed_input, 1, plane, conv_param->input_channel_, to_dtype);
   } else {
-    PackNHWCToNC4HW4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_);
+    kernel::PackNHWCToNC4HW4<T2, T2>(input_data, packed_input, 1, plane, conv_param->input_channel_, to_dtype);
   }
 
   // pack weight
   int OC4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int pack_weight_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_;
-  float *packed_weight = weight_data;
+  T1 *packed_weight = weight_data;
 
-  // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+  // T1 bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
+  T1 bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
   size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) *
                               conv_param->output_h_ * conv_param->output_w_;
 
-  std::vector<int> shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_,
-                               conv_param->input_channel_};  // Note!!!actual is NHWC4
   std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
   std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_,
-                                conv_param->output_channel_};
-  auto tensor_a =
-    std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_in, format);  // Note!!!actual is NHWC4
-  auto tensor_b = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC);
-  auto tensor_c = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC);
-  auto tensor_d = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_out, format);
-  std::vector<lite::tensor::Tensor *> inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()};
-  std::vector<lite::tensor::Tensor *> outputs{tensor_d.get()};
-  if (tensor_a.get() == nullptr || tensor_b.get() == nullptr || tensor_c.get() == nullptr ||
-      tensor_d.get() == nullptr) {
+  std::vector<int> shape_out;
+  std::vector<int> shape_in;
+  if (format == schema::Format_NHWC || format == schema::Format_NHWC4) {
+    shape_in = std::vector<int>(
+      {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_, conv_param->input_channel_});
+    shape_out = std::vector<int>(
+      {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_, conv_param->output_channel_});
+  } else if (format == schema::Format_NCHW || format == schema::Format_NC4HW4) {
+    shape_in = std::vector<int>(
+      {conv_param->input_batch_, conv_param->input_channel_, conv_param->input_h_, conv_param->input_w_});
+    shape_out = std::vector<int>(
+      {conv_param->output_batch_, conv_param->output_channel_, conv_param->output_h_, conv_param->output_w_});
+  } else {
+    MS_LOG(ERROR) << "Unsupported format: " << format;
+    delete[] packed_input;
     return;
   }
+  auto tensor_a = lite::tensor::Tensor(TypeId(dtype), shape_in, format);
+  auto tensor_b = lite::tensor::Tensor(TypeId(dtype), shape_filter, schema::Format_NHWC);
+  auto tensor_c = lite::tensor::Tensor(TypeId(dtype), shape_bias, schema::Format_NHWC);
+  auto tensor_d = lite::tensor::Tensor(TypeId(dtype), shape_out, format);
+  std::vector<lite::tensor::Tensor *> inputs{&tensor_a, &tensor_b, &tensor_c};
+  std::vector<lite::tensor::Tensor *> outputs{&tensor_d};
 
   // freamework to do!!!
   inputs[1]->SetData(packed_weight);
@@ -87,43 +96,48 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
   OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param);
   auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
   if (pKernel.get() == nullptr) {
+    delete[] packed_input;
     return;
   }
   pKernel->Init();
 
   std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
-  std::vector<lite::tensor::Tensor *> inputs_{tensor_a.get()};
+  std::vector<lite::tensor::Tensor *> inputs_{&tensor_a};
   size_t C4 = UP_DIV(inputs[0]->Channel(), C4NUM);
-  inputs[0]->MallocData(allocator);
   auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  if (pKernel.get() == nullptr) {
+  if (pGraph.get() == nullptr) {
+    delete[] packed_input;
     return;
   }
   pGraph->Init();
 
   // freamework to do!!!
-  memcpy(inputs[0]->Data(), packed_input.get(), sizeof(float) * pack_input_size);
+  inputs[0]->MallocData(allocator);
+  memcpy(inputs[0]->Data(), packed_input, sizeof(T2) * pack_input_size);
 
   pGraph->Run();
 
   if (is_compare) {
-    float_t *packed_output = reinterpret_cast<float *>(outputs[0]->Data());
-    auto packed_correct_data = std::make_unique<float_t>(packed_output_size);
-    if (packed_correct_data) {
+    T2 *packed_output = reinterpret_cast<T2 *>(outputs[0]->Data());
+    auto packed_correct_data = std::make_unique<T2>(packed_output_size);
+    if (packed_correct_data.get() == nullptr) {
+      delete[] packed_input;
      return;
    }
-    memset(packed_correct_data.get(), 0, packed_output_size * sizeof(float_t));
+    memset(packed_correct_data.get(), 0, packed_output_size * sizeof(T2));
     if (format == schema::Format_NC4HW4) {
-      PackNHWCToNC4HW4Fp32(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
-                           conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
+      kernel::PackNHWCToNC4HW4<T2, T2>(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
+                                       conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
+                                       to_dtype);
    } else {
-      PackNHWCToNHWC4Fp32(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
-                          conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
+      kernel::PackNHWCToNHWC4<T2, T2>(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
+                                      conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
+                                      to_dtype);
    }
 
     printf("==================input_data=================\n");
     std::cout << std::endl;
     for (int i = 0; i < pack_input_size; i++) {
-      std::cout << packed_input.get()[i] << ", ";
+      std::cout << packed_input[i] << ", ";
    }
     std::cout << std::endl;
     printf("==================weight data=================\n");
@@ -134,7 +148,7 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
     std::cout << std::endl;
     printf("==================output data=================\n");
     std::cout << std::endl;
-    for (int i = 0; i < 80 /*packed_output_size*/; i++) {
+    for (int i = 0; i < packed_output_size; i++) {
       std::cout << packed_output[i] << ", ";
    }
     std::cout << std::endl;
@@ -144,11 +158,12 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
    }
     std::cout << std::endl;
     // compare
-    CommonTest::CompareOutputData(packed_output, packed_correct_data.get(), packed_output_size, 0.00001);
+    CommonTest::CompareOutputData<T2>(packed_output, packed_correct_data.get(), packed_output_size, err_max);
   }
 
   inputs[1]->SetData(nullptr);
   inputs[2]->SetData(nullptr);
+  delete[] packed_input;
   return;
 }
 
@@ -194,7 +209,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
   float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
                       2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
 }
 
 TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
@@ -266,7 +281,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
                       0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
                       1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
 }
 
 TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
@@ -311,7 +326,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
   float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
                       2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
 }
 
 TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
@@ -383,12 +398,10 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
                       0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
                       1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
 }
 
-TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  ocl_runtime->Init();
+TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) {
   auto conv_param = std::make_unique<ConvParameter>();
   {
     conv_param->input_batch_ = 1;
@@ -410,109 +423,33 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) {
   }
 
   // nhwc
-  float input_data[] = {0.5488135, 0.0202184, 0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
-                        0.60276335, 0.77815676, 0.0187898, 0.57019675, 0.5448832, 0.87001216, 0.6176355, 0.43860152,
-                        0.4236548, 0.9786183, 0.6120957, 0.9883738, 0.6458941, 0.7991586, 0.616934, 0.10204481,
-                        0.4375872, 0.46147937, 0.94374806, 0.20887676, 0.891773, 0.7805292, 0.6818203, 0.16130951,
-                        0.96366274, 0.11827443, 0.3595079, 0.6531083, 0.3834415, 0.639921, 0.43703195, 0.2532916,
-                        0.79172504, 0.14335328, 0.6976312, 0.46631077, 0.5288949, 0.9446689, 0.06022547, 0.2444256,
-                        0.56804454, 0.5218483, 0.6667667, 0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
-                        0.07103606, 0.2645556, 0.21038257, 0.6563296, 0.0871293, 0.7742337, 0.12892629, 0.13818295};
-
-  // pack input
-  int IC4 = UP_DIV(conv_param->input_channel_, C4NUM);
-  int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_;
-  float *packed_input = input_data;
+  float16_t input_data[] = {
+    0.5488135, 0.0202184, 0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
+    0.60276335, 0.77815676, 0.0187898, 0.57019675, 0.5448832, 0.87001216, 0.6176355, 0.43860152,
+    0.4236548, 0.9786183, 0.6120957, 0.9883738, 0.6458941, 0.7991586, 0.616934, 0.10204481,
+    0.4375872, 0.46147937, 0.94374806, 0.20887676, 0.891773, 0.7805292, 0.6818203, 0.16130951,
+    0.96366274, 0.11827443, 0.3595079, 0.6531083, 0.3834415, 0.639921, 0.43703195, 0.2532916,
+    0.79172504, 0.14335328, 0.6976312, 0.46631077, 0.5288949, 0.9446689, 0.06022547, 0.2444256,
+    0.56804454, 0.5218483, 0.6667667, 0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
+    0.07103606, 0.2645556, 0.21038257, 0.6563296, 0.0871293, 0.7742337, 0.12892629, 0.13818295};
 
   // co h w ci
-  float weight_data[] = {0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449, 0.09609841, 0.97645944, 0.4686512,
-                         0.9767611, 0.6048455, 0.7392636, 0.03918779, 0.28280696, 0.12019656, 0.2961402, 0.11872772,
-                         0.31798318, 0.41426298, 0.06414749, 0.6924721, 0.56660146, 0.2653895, 0.5232481, 0.09394051,
-                         0.5759465, 0.9292962, 0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.18319136,
-                         0.5865129, 0.02010755, 0.82894003, 0.00469548};
-
-  // pack weight
-  int OC4 = UP_DIV(conv_param->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param->kernel_h_ * conv_param->kernel_w_;
-  float *packed_weight = weight_data;
-
-  // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-  size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) *
-                              conv_param->output_h_ * conv_param->output_w_;
-
-  std::vector<int> shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_,
-                               IC4 * C4NUM};  // Note!!!actual is NHWC4
-  std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
-  std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_,
-                                conv_param->output_channel_};
-  auto tensor_a = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_in,
-                                                         schema::Format_NC4HW4);  // Note!!!actual is NHWC4
-  auto tensor_b = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC);
-  auto tensor_c = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC);
-  auto tensor_d = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_out, schema::Format_NC4HW4);
-  std::vector<lite::tensor::Tensor *> inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()};
-  std::vector<lite::tensor::Tensor *> outputs{tensor_d.get()};
-
-  // freamework to do!!!
-  inputs[1]->SetData(packed_weight);
-  inputs[2]->SetData(bias_data);
-
-  OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param.get());
-  auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
-  pKernel->Init();
-
-  std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
-  std::vector<lite::tensor::Tensor *> inputs_{tensor_a.get()};
-  inputs[0]->MallocData();
-  auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  pGraph->Init();
-
-  // freamework to do!!!
-  memcpy(inputs[0]->Data(), packed_input, sizeof(float) * pack_input_size);
-
-  pGraph->Run();
-  float *packed_output = reinterpret_cast<float *>(outputs[0]->Data());
+  float16_t weight_data[] = {
+    0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449, 0.09609841, 0.97645944, 0.4686512, 0.9767611,
+    0.6048455, 0.7392636, 0.03918779, 0.28280696, 0.12019656, 0.2961402, 0.11872772, 0.31798318, 0.41426298,
+    0.06414749, 0.6924721, 0.56660146, 0.2653895, 0.5232481, 0.09394051, 0.5759465, 0.9292962, 0.31856894,
+    0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.18319136, 0.5865129, 0.02010755, 0.82894003, 0.00469548};
 
   // pack correct data, nhwc
-  float packed_correct_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
-                                 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
+  float16_t gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
+                          2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
 
-  printf("==================input_data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_input_size; i++) {
-    std::cout << packed_input[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================packed_weight data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_weight_size; i++) {
-    std::cout << packed_weight[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================output data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_output[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================expected output data=================\n");
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_correct_data[i] << ", ";
-  }
-  std::cout << std::endl;
-  // compare
-  CommonTest::CompareOutputData(packed_output, packed_correct_data, packed_output_size, 0.00001);
-
-  inputs[1]->SetData(nullptr);
-  inputs[2]->SetData(nullptr);
-  MS_LOG(INFO) << "TestConvolutionDwNoPadFp32 passed";
+  lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
+  DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
+                                          kNumberTypeFloat16, true, 1e-2);
 }
 
-TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  ocl_runtime->Init();
+TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) {
   auto conv_param = std::make_unique<ConvParameter>();
   {
     conv_param->input_batch_ = 1;
@@ -534,13 +471,13 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
   }
 
   // nhwc
-  float input_data[] = {0.5488135, 0.3834415, 0.77815676, 0.9446689, 0.6120957, 0.71518934, 0.79172504, 0.87001216,
-                        0.5218483, 0.616934, 0.60276335, 0.5288949, 0.9786183, 0.41466194, 0.94374806, 0.5448832,
-                        0.56804454, 0.7991586, 0.2645556, 0.6818203, 0.4236548, 0.92559665, 0.46147937, 0.7742337,
-                        0.3595079, 0.6458941, 0.07103606, 0.7805292, 0.45615032, 0.43703195, 0.4375872, 0.0871293,
-                        0.11827443, 0.56843394, 0.6976312, 0.891773, 0.0202184, 0.639921, 0.0187898, 0.06022547,
-                        0.96366274, 0.83261985, 0.14335328, 0.6176355, 0.6667667};
-  // float input_data[]={
+  float16_t input_data[] = {
+    0.5488135, 0.3834415, 0.77815676, 0.9446689, 0.6120957, 0.71518934, 0.79172504, 0.87001216, 0.5218483,
+    0.616934, 0.60276335, 0.5288949, 0.9786183, 0.41466194, 0.94374806, 0.5448832, 0.56804454, 0.7991586,
+    0.2645556, 0.6818203, 0.4236548, 0.92559665, 0.46147937, 0.7742337, 0.3595079, 0.6458941, 0.07103606,
+    0.7805292, 0.45615032, 0.43703195, 0.4375872, 0.0871293, 0.11827443, 0.56843394, 0.6976312, 0.891773,
+    0.0202184, 0.639921, 0.0187898, 0.06022547, 0.96366274, 0.83261985, 0.14335328, 0.6176355, 0.6667667};
+  // float16_t input_data[]={
   //  1 , 1 , 1 , 1 , 1 ,
   //  1 , 1 , 1 , 1 , 1 ,
   //  1 , 1 , 1 , 1 , 1 ,
@@ -550,23 +487,14 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
   //  1 , 1 , 1 , 1 , 1 ,
   //  1 , 1 , 1 , 1 , 1 ,
   //  1 , 1 , 1 , 1 , 1 };
-
-  // pack input
-  int IC4 = UP_DIV(conv_param->input_channel_, C4NUM);
-  int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_;
-  auto packed_input = std::make_unique<float>(pack_input_size);
-  memset(packed_input.get(), 0, pack_input_size * sizeof(float));
-  int plane = conv_param->input_w_ * conv_param->input_h_;
-  PackNHWCToNC4HW4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_);
 
   // co h w ci
-  float weight_data[] = {0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,
-                         0.10204481, 0.20887676, 0.16130951, 0.6531083, 0.2532916, 0.46631077, 0.2444256, 0.15896958,
-                         0.11037514, 0.6563296, 0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,
-                         0.09609841, 0.97645944, 0.4686512, 0.9767611, 0.6048455, 0.7392636, 0.03918779, 0.28280696,
-                         0.12019656, 0.2961402, 0.11872772, 0.31798318, 0.41426298, 0.06414749, 0.6924721, 0.56660146,
-                         0.2653895, 0.5232481, 0.09394051, 0.5759465, 0.9292962};
-  // float weight_data[]={
+  float16_t weight_data[] = {
+    0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738, 0.10204481,
+    0.20887676, 0.16130951, 0.6531083, 0.2532916, 0.46631077, 0.2444256, 0.15896958, 0.11037514, 0.6563296,
+    0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449, 0.09609841, 0.97645944, 0.4686512,
+    0.9767611, 0.6048455, 0.7392636, 0.03918779, 0.28280696, 0.12019656, 0.2961402, 0.11872772, 0.31798318,
+    0.41426298, 0.06414749, 0.6924721, 0.56660146, 0.2653895, 0.5232481, 0.09394051, 0.5759465, 0.9292962};
+  // float16_t weight_data[]={
   //  1 , 1 , 1 ,
   //  1 , 1 , 1 ,
   //  1 , 1 , 1 ,
@@ -582,95 +510,20 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
   //  1 , 1 , 1 ,
   //  1 , 1 , 1 ,
   //  1 , 1 , 1 };
-
-  // pack weight
-  int OC4 = UP_DIV(conv_param->output_channel_, C4NUM);
-  int pack_weight_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_;
-  float *packed_weight = weight_data;
-
-  // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-  size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) *
-                              conv_param->output_h_ * conv_param->output_w_;
-
-  std::vector<int> shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_,
-                               IC4 * C4NUM};  // Note!!!actual is NHWC4
-  std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
-  std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_,
-                                conv_param->output_channel_};
-  auto tensor_a = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_in,
-                                                         schema::Format_NC4HW4);  // Note!!!actual is NHWC4
-  auto tensor_b = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC);
-  auto tensor_c = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC);
-  auto tensor_d = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_out, schema::Format_NC4HW4);
-  std::vector<lite::tensor::Tensor *> inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()};
-  std::vector<lite::tensor::Tensor *> outputs{tensor_d.get()};
-
-  // freamework to do!!!
-  inputs[1]->SetData(packed_weight);
-  inputs[2]->SetData(bias_data);
-
-  OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param.get());
-  auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
-  pKernel->Init();
-
-  std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
-  std::vector<lite::tensor::Tensor *> inputs_{tensor_a.get()};
-  inputs[0]->MallocData();
-  auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  pGraph->Init();
-
-  // freamework to do!!!
-  memcpy(inputs[0]->Data(), packed_input.get(), sizeof(float) * pack_input_size);
-
-  pGraph->Run();
-  float *packed_output = reinterpret_cast<float *>(outputs[0]->Data());
 
   // pack correct data, nhwc
-  float correct_data[] = {1.189188, 1.0425153, 1.8012011, 0.6074867, 1.2120346, 1.5005531, 0.8346756, 2.4365785,
+  float16_t gnd_data[] = {1.189188, 1.0425153, 1.8012011, 0.6074867, 1.2120346, 1.5005531, 0.8346756, 2.4365785,
                           0.54975945, 1.6815965, 1.2690231, 0.60214907, 1.6158017, 0.42115876, 0.8854959, 1.1709145,
                           1.0929465, 1.3534508, 1.1985044, 1.2932993, 2.4621446, 1.7086457, 2.6977584, 2.1960166,
                           2.3769147, 2.3185873, 0.6133741, 0.9687358, 0.9987654, 1.0254729, 0.8368954, 0.74171704,
                           0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
                           1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
-  auto packed_correct_data = std::make_unique<float>(packed_output_size);
-  memset(packed_correct_data.get(), 0, packed_output_size * sizeof(float));
-  PackNHWCToNC4HW4Fp32(correct_data, packed_correct_data.get(), conv_param->output_batch_,
-                       conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
-
-  printf("==================input_data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_input_size; i++) {
-    std::cout << packed_input.get()[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================weight data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_weight_size; i++) {
-    std::cout << packed_weight[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================output data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_output[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================expected output data=================\n");
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_correct_data.get()[i] << ", ";
-  }
-  std::cout << std::endl;
-  // compare
-  CommonTest::CompareOutputData(packed_output, packed_correct_data.get(), packed_output_size, 0.00001);
 
-  inputs[1]->SetData(nullptr);
-  inputs[2]->SetData(nullptr);
-  MS_LOG(INFO) << "TestConvolutionDwPadFp32 passed";
+  lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
+  DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
+                                          kNumberTypeFloat16, true, 1e-2);
 }
 
-TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) {
+TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2Fp32) {
   std::vector<std::vector<int>> src_shape{
     {1, 32, 112, 112}, {1, 96, 112, 112}, {1, 144, 56, 56}, {1, 144, 56, 56}, {1, 192, 28, 28},
     {1, 192, 28, 28},  {1, 384, 14, 14},  {1, 576, 14, 14}, {1, 576, 14, 14}, {1, 960, 7, 7},
@@ -685,101 +538,56 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2Fp32) {
   };
 
   // nhwc
-  size_t in_size = 96 * 112 * 112;
-  auto input_data = std::make_unique<float_t>(in_size);
-  memset(input_data.get(), 0, in_size);
+  const size_t in_size = 96 * 112 * 112;
+  float *input_data = new (std::nothrow) float[in_size];
+  if (input_data == nullptr) {
+    return;
+  }
+  memset(input_data, 0, in_size * sizeof(float_t));
   for (auto i = 0; i < in_size; ++i) {
-    input_data.get()[i] = 1;
+    input_data[i] = 1;
  }
   // co h w ci
-  size_t wt_size = 576 * 3 * 3;
-  auto weight_data = std::make_unique<float_t>(wt_size);
-  memset(weight_data.get(), 0, wt_size);
+  const size_t wt_size = 576 * 3 * 3;
+  float *weight_data = new (std::nothrow) float[wt_size];
+  if (weight_data == nullptr) {
+    delete [] input_data;
+    return;
+  }
+  memset(weight_data, 0, wt_size);
   for (auto i = 0; i < wt_size; ++i) {
-    weight_data.get()[i] = 1;
+    weight_data[i] = 1;
  }
-  size_t out_size = 96 * 112 * 112;
-  auto gnd_data = std::make_unique<float_t>(out_size);
-  memset(gnd_data.get(), 0, out_size);
-  // for (auto i = 0; i < in_size; ++i) {
-  //   gnd_data[i] = 1;
-  // }
 
   for (size_t i = 0; i < src_shape.size(); ++i) {
     const int MAX_RUN_TIMES = 1;
     for (int j = 0; j < MAX_RUN_TIMES; ++j) {
       printf("========profiling depthwise, in shape(%d,%d,%d,%d), out shape(%d,%d,%d,%d), iter%d========\n",
              src_shape[i][0], src_shape[i][1], src_shape[i][2], src_shape[i][3], dst_shape[i][0], dst_shape[i][1],
             dst_shape[i][2], dst_shape[i][3], j);
-      auto conv_param = std::make_unique<ConvParameter>();
+      auto conv_param = ConvParameter();
      {
-        conv_param->input_batch_ = 1;
-        conv_param->input_h_ = src_shape[i][2];
-        conv_param->input_w_ = src_shape[i][3];
-        conv_param->input_channel_ = src_shape[i][1];
-        conv_param->output_batch_ = 1;
-        conv_param->output_h_ = dst_shape[i][2];
-        conv_param->output_w_ = dst_shape[i][3];
-        conv_param->output_channel_ = dst_shape[i][1];
-        conv_param->kernel_h_ = filter_shape[i][1];
-        conv_param->kernel_w_ = filter_shape[i][2];
-        conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
-        conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
-        conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
-        conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
-        conv_param->dilation_h_ = 1;
-        conv_param->dilation_w_ = 1;
+        conv_param.input_batch_ = 1;
+        conv_param.input_h_ = src_shape[i][2];
+        conv_param.input_w_ = src_shape[i][3];
+        conv_param.input_channel_ = src_shape[i][1];
+        conv_param.output_batch_ = 1;
+        conv_param.output_h_ = dst_shape[i][2];
+        conv_param.output_w_ = dst_shape[i][3];
+        conv_param.output_channel_ = dst_shape[i][1];
+        conv_param.kernel_h_ = filter_shape[i][1];
+        conv_param.kernel_w_ = filter_shape[i][2];
+        conv_param.stride_h_ = conv_param.output_h_ / conv_param.input_h_;
+        conv_param.stride_w_ = conv_param.output_w_ / conv_param.input_w_;
+        conv_param.pad_u_ = (conv_param.kernel_h_ - 1) / 2;
+        conv_param.pad_l_ = (conv_param.kernel_w_ - 1) / 2;
+        conv_param.dilation_h_ = 1;
+        conv_param.dilation_w_ = 1;
      }
-      // DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, false);
-      DepthWiseTestMain(conv_param.get(), input_data.get(), weight_data.get(), nullptr, schema::Format_NHWC4, false);
+      DepthWiseTestMain<float, float>(&conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4,
+                                      kNumberTypeFloat32, false);
    }
  }
-}
-
-TEST_F(TestConvolutionDwOpenCL, Buffer2Image) {
-  std::vector<int> src_shape{1, 96, 64, 64};
-  std::vector<int> dst_shape{1, 96, 32, 32};
-  std::vector<int> filter_shape{96, 3, 3, 1};
-
-  // nhwc
-  size_t in_size = 96 * 112 * 112;
-  auto input_data = std::make_unique<float_t>(in_size);
-  memset(input_data.get(), 0, in_size);
-  for (auto i = 0; i < in_size; ++i) {
-    input_data.get()[i] = 1;
-  }
-  // co h w ci
-  size_t wt_size = 576 * 3 * 3;
-  auto weight_data = std::make_unique<float_t>(wt_size);
-  memset(weight_data.get(), 0, wt_size);
-  for (auto i = 0; i < wt_size; ++i) {
-    weight_data.get()[i] = 1;
-  }
-  size_t out_size = 96 * 112 * 112;
-  auto gnd_data = std::make_unique<float_t>(out_size);
-  memset(gnd_data.get(), 0, out_size);
-  // for (auto i = 0; i < in_size; ++i) {
-  //   gnd_data[i] = 1;
-  // }
-  auto conv_param = std::make_unique<ConvParameter>();
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = src_shape[2];
-    conv_param->input_w_ = src_shape[3];
-    conv_param->input_channel_ = src_shape[1];
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = dst_shape[2];
-    conv_param->output_w_ = dst_shape[3];
-    conv_param->output_channel_ = dst_shape[1];
-    conv_param->kernel_h_ = filter_shape[1];
-    conv_param->kernel_w_ = filter_shape[2];
-    conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
-    conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
-    conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
-    conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-  }
-  // DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, true);
-  DepthWiseTestMain(conv_param.get(), input_data.get(), weight_data.get(), gnd_data.get(), schema::Format_NHWC4, true);
+  delete [] input_data;
+  delete [] weight_data;
 }
 }  // namespace mindspore
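Reviewer note (not part of the patch): the diff introduces templated pack helpers in utils.h that take a per-element conversion functor, which is how the fp32 and fp16 paths share one layout transform. The sketch below is a minimal, self-contained illustration of how the new PackNCHWToNC4HW4 template is meant to be called; UP_DIV, C4NUM, and the identity converter are re-declared locally only for the example (the real code takes them from nnacl/op_base.h and uses Float32ToShort for the fp32-to-fp16 case).

    // g++ -std=c++14 pack_example.cc && ./a.out
    #include <cstdio>
    #include <functional>
    #include <vector>

    constexpr int C4NUM = 4;
    inline int UP_DIV(int x, int y) { return (x + y - 1) / y; }

    // Same layout transform as the template added in utils.h: NCHW -> NC4HW4,
    // with a per-element dtype conversion hook.
    template <class T1, class T2>
    void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel,
                          const std::function<T2(T1)> &to_dtype) {
      int c4 = UP_DIV(channel, C4NUM);
      for (int b = 0; b < batch; b++) {
        int src_offset = b * plane * channel;
        int dst_offset = b * plane * c4 * C4NUM;
        for (int c = 0; c < channel; c++) {
          int c4_block_num = c / C4NUM;
          int c4_block_rem = c % C4NUM;
          int src_c_offset = src_offset + c * plane;
          int dst_c_offset = dst_offset + c4_block_num * plane * C4NUM;
          for (int k = 0; k < plane; k++) {
            (static_cast<T2 *>(dst) + dst_c_offset + C4NUM * k + c4_block_rem)[0] =
              to_dtype((static_cast<T1 *>(src) + src_c_offset + k)[0]);
          }
        }
      }
    }

    int main() {
      // 5 channels -> 2 channel blocks of 4; unused lanes stay zero-padded.
      const int batch = 1, plane = 2, channel = 5;
      std::vector<float> nchw(batch * plane * channel);
      for (size_t i = 0; i < nchw.size(); ++i) nchw[i] = static_cast<float>(i);
      std::vector<float> nc4hw4(batch * plane * UP_DIV(channel, C4NUM) * C4NUM, 0.0f);
      // Identity converter; the kernel code swaps in a float->float16 converter here.
      std::function<float(float)> identity = [](float x) { return x; };
      PackNCHWToNC4HW4<float, float>(nchw.data(), nc4hw4.data(), batch, plane, channel, identity);
      for (float v : nc4hw4) printf("%.0f ", v);
      printf("\n");
      return 0;
    }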