diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
index a6e1c322e12f014ea8b679f722f8ec06085b6b84..17afaae4a95a4c14ca2d8c0e872d147f7ce8357b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
@@ -1,189 +1,186 @@
-__constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
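+// FLT, FLT4, TO_FLT4, READ_IMAGE and WRITE_IMAGE are expected to come in as build
+// options (e.g. float4/read_imagef for fp32, half4/read_imageh when fp16 is enabled).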
 __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                         float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                         float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
                                          int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        // FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
-        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
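+        // NC4HW4 image layout: x = w, y = c4 * H + h (c4: channel-block index)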
+        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(x_c, (Z * src_size.y + y_c)));
+        r += TO_FLT4(src_p * flt_p);
       }
       fx_c++;
     }
   }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  // dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
-  write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0);
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res);
 }
 
 __kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                        float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                        float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
                                         int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z + x_c * src_size.z, y_c));
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
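+        // NHWC4 image layout: x = w * C4 + c4, y = h (C4: channel-block count)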
+        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
+        r += TO_FLT4(src_p * flt_p);
       }
       fx_c++;
     }
   }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-  write_imagef(dst_data, (int2)(X * dst_size.z + Z, Y), res0);
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), res);
 }
 
 __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                            float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                            float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
                                             int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z;
   {
-    int y_c = y_offseted;
+    int y_c = y_offset;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     {
-      int x_c = x_offseted;
+      int x_c = x_offset;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        // FLT4 src_p = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
+        r += TO_FLT4(src_p * flt_p);
       }
     }
   }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-  write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0);
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
+  WRITE_IMAGE(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res);
 }
 __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                         float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                         float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
                                          int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
+        r += TO_FLT4(src_p * flt_p);
       }
       fx_c++;
     }
   }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res;
 }
 
 __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                        float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                        float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
                                         int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z * kernel_size.x * kernel_size.y;
   for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = y_offseted + ky * dilation.y;
+    int y_c = y_offset + ky * dilation.y;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = x_offseted + kx * dilation.x;
+      int x_c = x_offset + kx * dilation.x;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        r += TO_FLT4(src_p * flt_p);
       }
       fx_c++;
     }
   }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
 }
 
 __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                            float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                            float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
                                             int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
   FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int x_offseted = X * stride.x + padding.x;
-  int y_offseted = Y * stride.y + padding.y;
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
   int fx_c = Z;
   {
-    int y_c = y_offseted;
+    int y_c = y_offset;
     bool outside_y = y_c < 0 || y_c >= src_size.y;
     {
-      int x_c = x_offseted;
+      int x_c = x_offset;
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
-        FLT4 f = filter[fx_c];
-        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        r += TO_FLT4(src_final * f);
+        FLT4 flt_p = filter[fx_c];
+        FLT4 src_p = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        r += TO_FLT4(src_p * flt_p);
       }
     }
   }
-  FLT4 bias_val = bias[Z];
-  FLT4 res0 = TO_FLT4(r) + bias_val;
-  res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
+  FLT4 bias_p = bias[Z];
+  FLT4 res = TO_FLT4(r) + bias_p;
+  res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+  dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
index fecdcb488743b369167d3b4c4364c91719c656ed..bc08a62c2b2f01b9d826ebf28e37ac1e2c6461b7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -1,15 +1,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                          int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                           int4 shape) {
   int X = get_global_id(0);
@@ -47,58 +37,17 @@ __kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only
   }
   WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(X * size.y + Y) * size.z + Z]);
 }
-__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                            int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                           int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                           int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                            int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
+  // size(h, w, c4, 1), shape(n, c, h, w)
+  int X = get_global_id(0);  // h
+  int Y = get_global_id(1);  // w
+  int Z = get_global_id(2);  // c4
   if (X >= size.x || Y >= size.y || Z >= size.z) {
     return;
   }
-  // FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]);
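+  // NC4HW4 image: x = w, y = c4 * H + h; matching buffer index (c4 * H + h) * W + w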
+  WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), src_data[(Z * size.x + X) * size.y + Y]);
 }
-
 __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                          int4 shape) {
   int X = get_global_id(0);
@@ -109,56 +58,6 @@ __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __globa
   }
   dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z));
 }
-__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                         int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                          int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                           int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                         int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                         int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                           int4 shape) {
   int X = get_global_id(0);
@@ -185,25 +84,16 @@ __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __glob
     }
   }
 }
-__kernel void to_format_NC4HW4_to_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
-                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  //  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
 __kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
+  // size(h, w, c4, 1), shape(n, c, h, w)
+  int X = get_global_id(0);  // h
+  int Y = get_global_id(1);  // w
+  int Z = get_global_id(2);  // c4
   if (X >= size.x || Y >= size.y || Z >= size.z) {
     return;
   }
-  dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
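+  // inverse of the IMG kernel above: read the NC4HW4 image (x = w, y = c4 * H + h)
+  // back into the NC4HW4 buffer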
+  dst_data[(Z * size.x + X) * size.y + Y] = READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X));
 }
 __kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                            int4 shape) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index c3a9f681bc9315df3af1759a5f05cac58796f7ff..06e833c692e67b4656a2fd5a3d631a6803c81ca9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -20,9 +20,10 @@
 #include <utility>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
-#include "src/runtime/kernel/arm/fp32/convolution_depthwise.h"
+#include "src/runtime/kernel/opencl/utils.h"
+#include "nnacl/fp32/common_func.h"
+#include "nnacl/op_base.h"
 #include "include/errorcode.h"
-#include "nnacl/pack.h"
 
 #ifndef PROGRAM_WITH_IL
 
@@ -81,30 +82,50 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() {
   auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto allocator = ocl_runtime->GetAllocator();
+  bool is_fp16 = ocl_runtime->GetFp16Enable();
 
   // weight: o, h, w, i; o == group, i == 1
-  auto origin_weight = reinterpret_cast<FLOAT_t *>(in_tensors_.at(kWeightIndex)->Data());
+  void *origin_weight = in_tensors_.at(kWeightIndex)->Data();
   int CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
   int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;
 
-  packed_weight_ = reinterpret_cast<FLOAT_t *>(allocator->Malloc(pack_weight_size * sizeof(FLOAT_t)));
-  packed_weight_ = reinterpret_cast<FLOAT_t *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
   int plane = parameter->kernel_h_ * parameter->kernel_w_;
-#ifdef ENABLE_FP16
-  PackNCHWToNC4HW4Fp16(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel());
-#else
-  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel());
-#endif
+  if (is_fp16) {
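+    // pack the weights into NC4HW4, converting to the runtime precision on the fly;
+    // fp16 elements are stored in an int16_t-sized buffer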
+    packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(int16_t));
+    packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+    if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) {
+      std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
+      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+    } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
+      std::function<int16_t(float)> to_dtype = Float32ToShort;
+      PackNCHWToNC4HW4<float, int16_t>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+    } else {
+      MS_LOG(ERROR) << "Only support float16/float32, actual data type " << in_tensors_.at(kWeightIndex)->data_type();
+    }
+  } else {
+    packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(float));
+    packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+    if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
+      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
+      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+    } else {
+      MS_LOG(ERROR) << "Only support float16/float32, actual data type " << in_tensors_.at(kWeightIndex)->data_type();
+    }
+  }
 
   allocator->UnmapBuffer(packed_weight_);
 
   if (in_tensors_.size() == kInputSize2) {
-    bias_data_ = reinterpret_cast<FLOAT_t *>(allocator->Malloc(C4NUM * CO4 * sizeof(FLOAT_t)));
-    bias_data_ = reinterpret_cast<FLOAT_t *>(allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true));
-    size_t up_co_size = C4NUM * CO4 * sizeof(FLOAT_t);
+    size_t dtype_size = sizeof(float);
+    if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
+      dtype_size = sizeof(int16_t);
+    }
+    bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size);
+    bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true);
+    size_t up_co_size = C4NUM * CO4 * dtype_size;
     memset(bias_data_, 0, up_co_size);
-    auto ori_bias = reinterpret_cast<FLOAT_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * sizeof(FLOAT_t));
+    auto ori_bias = in_tensors_.at(kBiasIndex)->Data();
+    memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size);
     allocator->UnmapBuffer(bias_data_);
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
@@ -124,11 +145,10 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *i
     im_dst_y = out_tensors_[0]->Height() * CO4;
     im_dst_x = out_tensors_[0]->Width();
   }
-#ifdef ENABLE_FP16
-  size_t img_dtype = CL_HALF_FLOAT;
-#else
   size_t img_dtype = CL_FLOAT;
-#endif
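+  // the image element type follows the runtime precision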
+  if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
+    img_dtype = CL_HALF_FLOAT;
+  }
   img_size->clear();
   std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
   *img_size = vec;
@@ -204,5 +224,6 @@ kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector<lite::t
   return kernel;
 }
 
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
index 22c167b964b4809d3190b5b9cad2d833b0e3aea0..6564054483b6e480db6b28ebd3741cdf187688f7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
@@ -20,7 +20,6 @@
 #include <vector>
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/conv_parameter.h"
-
 #include "src/runtime/opencl/opencl_runtime.h"
 
 namespace mindspore::kernel {
@@ -46,8 +45,8 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
   int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
 
  private:
-  FLOAT_t *packed_weight_;
-  FLOAT_t *bias_data_;
+  void *packed_weight_;
+  void *bias_data_;
   cl::Kernel kernel_;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
index 7c4dd4546d9166298f9070c21ff327357755d72a..01f4572dbd7687d2d951925b61a92afd86e1960b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -172,5 +172,6 @@ kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::tensor::
   return kernel;
 }
 
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
index 4b19d71dc7bca37eedc3e17f8c20fa77d7222d91..bf269120f2edf6cb55a7c5091f52ff3818e6a390 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
@@ -93,11 +93,10 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *
     }
 
     out_tensors->emplace_back(new_tensor);
-#ifdef ENABLE_FP16
-    KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat};
-#else
     KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat};
-#endif
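+    // pick the fp16 ToFormat kernel when the OpenCL runtime runs in fp16 mode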
+    if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
+      desc.data_type = kNumberTypeFloat16;
+    }
     OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter;
     MS_ASSERT(parameter);
     if (parameter == nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h
index 07a87547bb9aef131ee5e9238c1d7cb1de9631b6..c498b65c3ff192c0a7773d101f6ecead0ce1fef0 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -23,6 +23,7 @@
 #include "utils/log_adapter.h"
 #include "nnacl/op_base.h"
 #include "src/lite_kernel.h"
+#include "src/common//utils.h"
 
 namespace mindspore::lite {
 kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors,
@@ -89,6 +90,73 @@ std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int ma
 
 std::string CLErrorCode(cl_int error_code);
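+// Packing helpers shared by the OpenCL kernels: they rearrange data into the C4-aligned
+// layouts (NC4HW4 / NHWC4) and convert the element type through to_dtype, so fp32 origin
+// data can be packed straight into an fp16 (int16_t-backed) buffer.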
 
+template <class T1, class T2>
+void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel,
+                      const std::function<T2(T1)> &to_dtype) {
+  int c4 = UP_DIV(channel, C4NUM);
+  for (int b = 0; b < batch; b++) {
+    int src_offset = b * plane * channel;
+    int dst_offset = b * plane * c4 * C4NUM;
+    for (int c = 0; c < channel; c++) {
+      int c4_block_num = c / C4NUM;
+      int c4_block_rem = c % C4NUM;
+      int src_c_offset = src_offset + c * plane;
+      int dst_c_offset = dst_offset + c4_block_num * plane * C4NUM;
+      for (int k = 0; k < plane; k++) {
+        int src_kernel_offset = src_c_offset + k;
+        int dst_kernel_offset = dst_c_offset + C4NUM * k + c4_block_rem;
+        (static_cast<T2 *>(dst) + dst_kernel_offset)[0] =
+          to_dtype((static_cast<T1 *>(src) + src_kernel_offset)[0]);
+      }
+    }
+  }
+}
+template <class T1, class T2>
+void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel,
+                     const std::function<T2(T1)> &to_dtype) {
+  int c4 = UP_DIV(channel, C4NUM);
+  int nhwc4_batch_unit_offset = c4 * C4NUM * plane;
+  int ic_remainder = channel % C4NUM;
+  if (ic_remainder != 0) {
+    int nhwc4_batch_offset = 0;
+    for (int b = 0; b < batch; b++) {
+      int batch_offset = b * channel * plane;
+      for (int i = 0; i < plane; ++i) {
+        for (int c = 0; c < channel; ++c) {
+          (static_cast<T2 *>(dst) + nhwc4_batch_offset + i * c4 * C4NUM + c)[0] =
+            to_dtype((static_cast<T1 *>(src) + batch_offset + i * channel + c)[0]);
+        }
+      }
+      nhwc4_batch_offset += nhwc4_batch_unit_offset;
+    }
+  } else {
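+    // channel is already a multiple of C4NUM: a plain element-wise copy/convert suffices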
+    size_t ori_input_size = batch * plane * channel;
+    for (size_t n = 0; n < ori_input_size; ++n) {
+      (static_cast<T2 *>(dst) + n)[0] = to_dtype((static_cast<T1 *>(src) + n)[0]);
+    }
+  }
+}
+template <class T1, class T2>
+void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel,
+                      const std::function<T2(T1)> &to_dtype) {
+  int c4 = UP_DIV(channel, C4NUM);
+  for (int b = 0; b < batch; b++) {
+    int src_oc_offset = b * plane * channel;
+    int dst_oc_offset = b * plane * c4 * C4NUM;
+    for (int k = 0; k < plane; k++) {
+      int src_kernel_offset = src_oc_offset + k * channel;
+      int dst_kernel_offset = dst_oc_offset + k * C4NUM;
+      for (int i = 0; i < channel; i++) {
+        int c4_block_num = i / C4NUM;
+        int c4_block_rem = i % C4NUM;
+        int src_ic_offset = src_kernel_offset + i;
+        int dst_ic_offset = dst_kernel_offset + c4_block_num * plane * C4NUM + c4_block_rem;
+        (static_cast<T2 *>(dst) + dst_ic_offset)[0] = to_dtype((static_cast<T1 *>(src) + src_ic_offset)[0]);
+      }
+    }
+  }
+}
+
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_UTILS_H_
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
index a88a23a5c25a30e160b1ac0896926b32d86538e9..3fb61f1d55c1269b69794d869fa42e8f31968583 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
@@ -19,20 +19,20 @@
 #include "common/common_test.h"
 #include "mindspore/lite/src/common/file_utils.h"
 #include "nnacl/pack.h"
+#include "src/runtime/kernel/opencl/utils.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h"
 
-bool IMAGE2D_OPEN = true;
-
 namespace mindspore {
 class TestConvolutionDwOpenCL : public mindspore::CommonTest {
  public:
   TestConvolutionDwOpenCL() {}
 };
 
-void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *weight_data, float_t *gnd_data,
-                       schema::Format format, bool is_compare = true) {
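+// T1: weight/bias element type, T2: input/output element type; dtype selects the tensor
+// data type and err_max is the tolerance handed to CompareOutputData.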
+template <class T1, class T2>
+void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_data, T2 *gnd_data, schema::Format format,
+                       TypeId dtype = kNumberTypeFloat32, bool is_compare = true, T2 err_max = 1e-5) {
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->Init();
   auto allocator = ocl_runtime->GetAllocator();
@@ -40,45 +40,54 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
   // pack input
   int IC4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_;
-  auto packed_input = std::make_unique<float>(pack_input_size);
-  if (packed_input.get() == nullptr) {
+  auto packed_input = new (std::nothrow) T2[pack_input_size];
+  if (packed_input == nullptr) {
     return;
   }
-  memset(packed_input.get(), 0, pack_input_size * sizeof(float));
+  memset(packed_input, 0, pack_input_size * sizeof(T2));
   int plane = conv_param->input_w_ * conv_param->input_h_;
+  std::function<T2(T2)> to_dtype = [](T2 x) -> T2 { return x; };
   if (format == schema::Format_NHWC4) {
-    PackNHWCToNHWC4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_);
+    kernel::PackNHWCToNHWC4<T2, T2>(input_data, packed_input, 1, plane, conv_param->input_channel_, to_dtype);
   } else {
-    PackNHWCToNC4HW4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_);
+    kernel::PackNHWCToNC4HW4<T2, T2>(input_data, packed_input, 1, plane, conv_param->input_channel_, to_dtype);
   }
 
   // pack weight
   int OC4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int pack_weight_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_;
-  float *packed_weight = weight_data;
+  T1 *packed_weight = weight_data;
 
-  // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+  // T1 bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
+  T1 bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
   size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) *
                               conv_param->output_h_ * conv_param->output_w_;
 
-  std::vector<int> shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_,
-                               conv_param->input_channel_};  // Note!!!actual is NHWC4
   std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
   std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_,
-                                conv_param->output_channel_};
-  auto tensor_a =
-    std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_in, format);  // Note!!!actual is NHWC4
-  auto tensor_b = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC);
-  auto tensor_c = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC);
-  auto tensor_d = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_out, format);
-  std::vector<lite::tensor::Tensor *> inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()};
-  std::vector<lite::tensor::Tensor *> outputs{tensor_d.get()};
-  if (tensor_a.get() == nullptr || tensor_b.get() == nullptr || tensor_c.get() == nullptr ||
-      tensor_d.get() == nullptr) {
+  std::vector<int> shape_out;
+  std::vector<int> shape_in;
+  if (format == schema::Format_NHWC || format == schema::Format_NHWC4) {
+    shape_in = std::vector<int>(
+      {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_, conv_param->input_channel_});
+    shape_out = std::vector<int>(
+      {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_, conv_param->output_channel_});
+  } else if (format == schema::Format_NCHW || format == schema::Format_NC4HW4) {
+    shape_in = std::vector<int>(
+      {conv_param->input_batch_, conv_param->input_channel_, conv_param->input_h_, conv_param->input_w_});
+    shape_out = std::vector<int>(
+      {conv_param->output_batch_, conv_param->output_channel_, conv_param->output_h_, conv_param->output_w_});
+  } else {
+    MS_LOG(ERROR) << "Unsupported format: " << format;
+    delete[] packed_input;
     return;
   }
+  auto tensor_a = lite::tensor::Tensor(TypeId(dtype), shape_in, format);
+  auto tensor_b = lite::tensor::Tensor(TypeId(dtype), shape_filter, schema::Format_NHWC);
+  auto tensor_c = lite::tensor::Tensor(TypeId(dtype), shape_bias, schema::Format_NHWC);
+  auto tensor_d = lite::tensor::Tensor(TypeId(dtype), shape_out, format);
+  std::vector<lite::tensor::Tensor *> inputs{&tensor_a, &tensor_b, &tensor_c};
+  std::vector<lite::tensor::Tensor *> outputs{&tensor_d};
 
-  // freamework to do!!!
+  // framework to do!!!
   inputs[1]->SetData(packed_weight);
@@ -87,43 +96,48 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
   OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param);
   auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
   if (pKernel.get() == nullptr) {
+    delete[] packed_input;
     return;
   }
   pKernel->Init();
 
   std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
-  std::vector<lite::tensor::Tensor *> inputs_{tensor_a.get()};
+  std::vector<lite::tensor::Tensor *> inputs_{&tensor_a};
   size_t C4 = UP_DIV(inputs[0]->Channel(), C4NUM);
-  inputs[0]->MallocData(allocator);
   auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  if (pKernel.get() == nullptr) {
+  if (pGraph.get() == nullptr) {
+    delete[] packed_input;
     return;
   }
   pGraph->Init();
 
-  // freamework to do!!!
+  // framework to do!!!
-  memcpy(inputs[0]->Data(), packed_input.get(), sizeof(float) * pack_input_size);
+  inputs[0]->MallocData(allocator);
+  memcpy(inputs[0]->Data(), packed_input, sizeof(T2) * pack_input_size);
 
   pGraph->Run();
   if (is_compare) {
-    float_t *packed_output = reinterpret_cast<float *>(outputs[0]->Data());
-    auto packed_correct_data = std::make_unique<float_t>(packed_output_size);
-    if (packed_correct_data) {
+    T2 *packed_output = reinterpret_cast<T2 *>(outputs[0]->Data());
+    auto packed_correct_data = std::make_unique<T2[]>(packed_output_size);
+    if (packed_correct_data.get() == nullptr) {
+      delete[] packed_input;
       return;
     }
-    memset(packed_correct_data.get(), 0, packed_output_size * sizeof(float_t));
+    memset(packed_correct_data.get(), 0, packed_output_size * sizeof(T2));
     if (format == schema::Format_NC4HW4) {
-      PackNHWCToNC4HW4Fp32(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
-                           conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
+      kernel::PackNHWCToNC4HW4<T2, T2>(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
+                                       conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
+                                       to_dtype);
     } else {
-      PackNHWCToNHWC4Fp32(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
-                          conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
+      kernel::PackNHWCToNHWC4<T2, T2>(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
+                                      conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
+                                      to_dtype);
     }
 
     printf("==================input_data=================\n");
     std::cout << std::endl;
     for (int i = 0; i < pack_input_size; i++) {
-      std::cout << packed_input.get()[i] << ", ";
+      std::cout << packed_input[i] << ", ";
     }
     std::cout << std::endl;
     printf("==================weight data=================\n");
@@ -134,7 +148,7 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
     std::cout << std::endl;
     printf("==================output data=================\n");
     std::cout << std::endl;
-    for (int i = 0; i < 80 /*packed_output_size*/; i++) {
+    for (int i = 0; i < packed_output_size; i++) {
       std::cout << packed_output[i] << ", ";
     }
     std::cout << std::endl;
@@ -144,11 +158,12 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t *
     }
     std::cout << std::endl;
     // compare
-    CommonTest::CompareOutputData(packed_output, packed_correct_data.get(), packed_output_size, 0.00001);
+    CommonTest::CompareOutputData<T2>(packed_output, packed_correct_data.get(), packed_output_size, err_max);
   }
 
   inputs[1]->SetData(nullptr);
   inputs[2]->SetData(nullptr);
+  delete[] packed_input;
   return;
 }
 
@@ -194,7 +209,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
   float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
                       2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
 }
 
 TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
@@ -266,7 +281,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
                       0.8749627,  0.8953936,  0.5093431,  1.5496738,  0.54936385, 0.7683113,  1.165742,  1.3682933,
                       1.0517888,  0.59817517, 0.75649744, 1.2075498,  0.38804203};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
 }
 
 TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
@@ -311,7 +326,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
   float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
                       2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
 }
 
 TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
@@ -383,12 +398,10 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
                       0.8749627,  0.8953936,  0.5093431,  1.5496738,  0.54936385, 0.7683113,  1.165742,  1.3682933,
                       1.0517888,  0.59817517, 0.75649744, 1.2075498,  0.38804203};
 
-  DepthWiseTestMain(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
+  DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
 }
 
-TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  ocl_runtime->Init();
+TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) {
   auto conv_param = std::make_unique<ConvParameter>();
   {
     conv_param->input_batch_ = 1;
@@ -410,109 +423,33 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) {
   }
 
   // nhwc
-  float input_data[] = {0.5488135,  0.0202184,  0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
-                        0.60276335, 0.77815676, 0.0187898,  0.57019675, 0.5448832,  0.87001216, 0.6176355,  0.43860152,
-                        0.4236548,  0.9786183,  0.6120957,  0.9883738,  0.6458941,  0.7991586,  0.616934,   0.10204481,
-                        0.4375872,  0.46147937, 0.94374806, 0.20887676, 0.891773,   0.7805292,  0.6818203,  0.16130951,
-                        0.96366274, 0.11827443, 0.3595079,  0.6531083,  0.3834415,  0.639921,   0.43703195, 0.2532916,
-                        0.79172504, 0.14335328, 0.6976312,  0.46631077, 0.5288949,  0.9446689,  0.06022547, 0.2444256,
-                        0.56804454, 0.5218483,  0.6667667,  0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
-                        0.07103606, 0.2645556,  0.21038257, 0.6563296,  0.0871293,  0.7742337,  0.12892629, 0.13818295};
-
-  // pack input
-  int IC4 = UP_DIV(conv_param->input_channel_, C4NUM);
-  int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_;
-  float *packed_input = input_data;
+  float16_t input_data[] = {
+    0.5488135,  0.0202184,  0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
+    0.60276335, 0.77815676, 0.0187898,  0.57019675, 0.5448832,  0.87001216, 0.6176355,  0.43860152,
+    0.4236548,  0.9786183,  0.6120957,  0.9883738,  0.6458941,  0.7991586,  0.616934,   0.10204481,
+    0.4375872,  0.46147937, 0.94374806, 0.20887676, 0.891773,   0.7805292,  0.6818203,  0.16130951,
+    0.96366274, 0.11827443, 0.3595079,  0.6531083,  0.3834415,  0.639921,   0.43703195, 0.2532916,
+    0.79172504, 0.14335328, 0.6976312,  0.46631077, 0.5288949,  0.9446689,  0.06022547, 0.2444256,
+    0.56804454, 0.5218483,  0.6667667,  0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
+    0.07103606, 0.2645556,  0.21038257, 0.6563296,  0.0871293,  0.7742337,  0.12892629, 0.13818295};
 
   // co h w ci
-  float weight_data[] = {0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,
-                         0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772,
-                         0.31798318, 0.41426298, 0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051,
-                         0.5759465,  0.9292962,  0.31856894, 0.6674104,  0.13179787, 0.7163272,  0.2894061,  0.18319136,
-                         0.5865129,  0.02010755, 0.82894003, 0.00469548};
-
-  // pack weight
-  int OC4 = UP_DIV(conv_param->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param->kernel_h_ * conv_param->kernel_w_;
-  float *packed_weight = weight_data;
-
-  // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-  size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) *
-                              conv_param->output_h_ * conv_param->output_w_;
-
-  std::vector<int> shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_,
-                               IC4 * C4NUM};  // Note!!!actual is NHWC4
-  std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
-  std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_,
-                                conv_param->output_channel_};
-  auto tensor_a = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_in,
-                                                         schema::Format_NC4HW4);  // Note!!!actual is NHWC4
-  auto tensor_b = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC);
-  auto tensor_c = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC);
-  auto tensor_d = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_out, schema::Format_NC4HW4);
-  std::vector<lite::tensor::Tensor *> inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()};
-  std::vector<lite::tensor::Tensor *> outputs{tensor_d.get()};
-
-  // freamework to do!!!
-  inputs[1]->SetData(packed_weight);
-  inputs[2]->SetData(bias_data);
-
-  OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param.get());
-  auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
-  pKernel->Init();
-
-  std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
-  std::vector<lite::tensor::Tensor *> inputs_{tensor_a.get()};
-  inputs[0]->MallocData();
-  auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  pGraph->Init();
-
-  // freamework to do!!!
-  memcpy(inputs[0]->Data(), packed_input, sizeof(float) * pack_input_size);
-
-  pGraph->Run();
-  float *packed_output = reinterpret_cast<float *>(outputs[0]->Data());
+  float16_t weight_data[] = {
+    0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,  0.9767611,
+    0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772, 0.31798318, 0.41426298,
+    0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962,  0.31856894,
+    0.6674104,  0.13179787, 0.7163272,  0.2894061,  0.18319136, 0.5865129,  0.02010755, 0.82894003, 0.00469548};
 
   // pack correct data, nhwc
-  float packed_correct_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
-                                 2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
+  float16_t gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
+                          2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
 
-  printf("==================input_data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_input_size; i++) {
-    std::cout << packed_input[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================packed_weight data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_weight_size; i++) {
-    std::cout << packed_weight[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================output data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_output[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================expected output data=================\n");
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_correct_data[i] << ", ";
-  }
-  std::cout << std::endl;
-  // compare
-  CommonTest::CompareOutputData(packed_output, packed_correct_data, packed_output_size, 0.00001);
-
-  inputs[1]->SetData(nullptr);
-  inputs[2]->SetData(nullptr);
-  MS_LOG(INFO) << "TestConvolutionDwNoPadFp32 passed";
+  lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
+  DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
+                                          kNumberTypeFloat16, true, 1e-2);
 }
 
-TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  ocl_runtime->Init();
+TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) {
   auto conv_param = std::make_unique<ConvParameter>();
   {
     conv_param->input_batch_ = 1;
@@ -534,13 +471,13 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
   }
 
   // nhwc
-  float input_data[] = {0.5488135,  0.3834415,  0.77815676, 0.9446689, 0.6120957,  0.71518934, 0.79172504, 0.87001216,
-                        0.5218483,  0.616934,   0.60276335, 0.5288949, 0.9786183,  0.41466194, 0.94374806, 0.5448832,
-                        0.56804454, 0.7991586,  0.2645556,  0.6818203, 0.4236548,  0.92559665, 0.46147937, 0.7742337,
-                        0.3595079,  0.6458941,  0.07103606, 0.7805292, 0.45615032, 0.43703195, 0.4375872,  0.0871293,
-                        0.11827443, 0.56843394, 0.6976312,  0.891773,  0.0202184,  0.639921,   0.0187898,  0.06022547,
-                        0.96366274, 0.83261985, 0.14335328, 0.6176355, 0.6667667};
-  // float input_data[]={
+  float16_t input_data[] = {
+    0.5488135, 0.3834415,  0.77815676, 0.9446689,  0.6120957,  0.71518934, 0.79172504, 0.87001216, 0.5218483,
+    0.616934,  0.60276335, 0.5288949,  0.9786183,  0.41466194, 0.94374806, 0.5448832,  0.56804454, 0.7991586,
+    0.2645556, 0.6818203,  0.4236548,  0.92559665, 0.46147937, 0.7742337,  0.3595079,  0.6458941,  0.07103606,
+    0.7805292, 0.45615032, 0.43703195, 0.4375872,  0.0871293,  0.11827443, 0.56843394, 0.6976312,  0.891773,
+    0.0202184, 0.639921,   0.0187898,  0.06022547, 0.96366274, 0.83261985, 0.14335328, 0.6176355,  0.6667667};
+  // float16_t input_data[]={
   //   1  , 1  , 1 , 1  , 1  ,
   //   1  , 1  , 1 , 1  , 1  ,
   //   1  , 1  , 1 , 1  , 1  ,
@@ -550,23 +487,14 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
   //   1  , 1  , 1 , 1  , 1  ,
   //   1  , 1  , 1 , 1  , 1  ,
   //   1  , 1  , 1 , 1  , 1  };
-
-  // pack input
-  int IC4 = UP_DIV(conv_param->input_channel_, C4NUM);
-  int pack_input_size = C4NUM * IC4 * conv_param->input_h_ * conv_param->input_w_;
-  auto packed_input = std::make_unique<float>(pack_input_size);
-  memset(packed_input.get(), 0, pack_input_size * sizeof(float));
-  int plane = conv_param->input_w_ * conv_param->input_h_;
-  PackNHWCToNC4HW4Fp32(input_data, packed_input.get(), 1, plane, conv_param->input_channel_);
-
   // co h w ci
-  float weight_data[] = {0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,
-                         0.10204481, 0.20887676, 0.16130951, 0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958,
-                         0.11037514, 0.6563296,  0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,
-                         0.09609841, 0.97645944, 0.4686512,  0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696,
-                         0.12019656, 0.2961402,  0.11872772, 0.31798318, 0.41426298, 0.06414749, 0.6924721,  0.56660146,
-                         0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962};
-  // float weight_data[]={
+  float16_t weight_data[] = {
+    0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,  0.10204481,
+    0.20887676, 0.16130951, 0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958, 0.11037514, 0.6563296,
+    0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,
+    0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772, 0.31798318,
+    0.41426298, 0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962};
+  // float16_t weight_data[]={
   //   1  , 1  , 1 ,
   //   1  , 1  , 1 ,
   //   1  , 1  , 1 ,
@@ -582,95 +510,20 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
   //   1  , 1  , 1 ,
   //   1  , 1  , 1 ,
   //   1  , 1  , 1 };
-
-  // pack weight
-  int OC4 = UP_DIV(conv_param->output_channel_, C4NUM);
-  int pack_weight_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_;
-  float *packed_weight = weight_data;
-
-  // float bias_data[] = {0.31856894, 0.6674104, 0.13179787, 0.7163272, 0.2894061, 0.0, 0.0, 0.0};
-  float bias_data[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-  size_t packed_output_size = conv_param->output_batch_ * C4NUM * UP_DIV(conv_param->output_channel_, C4NUM) *
-                              conv_param->output_h_ * conv_param->output_w_;
-
-  std::vector<int> shape_in = {conv_param->input_batch_, conv_param->input_h_, conv_param->input_w_,
-                               IC4 * C4NUM};  // Note!!!actual is NHWC4
-  std::vector<int> shape_filter = {1, conv_param->kernel_h_, conv_param->kernel_w_, conv_param->output_channel_};
-  std::vector<int> shape_bias = {conv_param->output_channel_};
-  std::vector<int> shape_out = {conv_param->output_batch_, conv_param->output_h_, conv_param->output_w_,
-                                conv_param->output_channel_};
-  auto tensor_a = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_in,
-                                                         schema::Format_NC4HW4);  // Note!!!actual is NHWC4
-  auto tensor_b = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_filter, schema::Format_NHWC);
-  auto tensor_c = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_bias, schema::Format_NHWC);
-  auto tensor_d = std::make_unique<lite::tensor::Tensor>(TypeId(kNumberTypeFloat32), shape_out, schema::Format_NC4HW4);
-  std::vector<lite::tensor::Tensor *> inputs{tensor_a.get(), tensor_b.get(), tensor_c.get()};
-  std::vector<lite::tensor::Tensor *> outputs{tensor_d.get()};
-
-  // freamework to do!!!
-  inputs[1]->SetData(packed_weight);
-  inputs[2]->SetData(bias_data);
-
-  OpParameter *parameter = reinterpret_cast<OpParameter *>(conv_param.get());
-  auto pKernel = std::make_unique<kernel::DepthwiseConv2dOpenCLKernel>(parameter, inputs, outputs);
-  pKernel->Init();
-
-  std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
-  std::vector<lite::tensor::Tensor *> inputs_{tensor_a.get()};
-  inputs[0]->MallocData();
-  auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
-  pGraph->Init();
-
-  // freamework to do!!!
-  memcpy(inputs[0]->Data(), packed_input.get(), sizeof(float) * pack_input_size);
-
-  pGraph->Run();
-  float *packed_output = reinterpret_cast<float *>(outputs[0]->Data());
-
-  // pack correct data, nhwc
+  // expected output data, nhwc
-  float correct_data[] = {1.189188,   1.0425153,  1.8012011,  0.6074867,  1.2120346,  1.5005531,  0.8346756, 2.4365785,
+  float16_t gnd_data[] = {1.189188,   1.0425153,  1.8012011,  0.6074867,  1.2120346,  1.5005531,  0.8346756, 2.4365785,
                           0.54975945, 1.6815965,  1.2690231,  0.60214907, 1.6158017,  0.42115876, 0.8854959, 1.1709145,
                           1.0929465,  1.3534508,  1.1985044,  1.2932993,  2.4621446,  1.7086457,  2.6977584, 2.1960166,
                           2.3769147,  2.3185873,  0.6133741,  0.9687358,  0.9987654,  1.0254729,  0.8368954, 0.74171704,
                           0.8749627,  0.8953936,  0.5093431,  1.5496738,  0.54936385, 0.7683113,  1.165742,  1.3682933,
                           1.0517888,  0.59817517, 0.75649744, 1.2075498,  0.38804203};
-  auto packed_correct_data = std::make_unique<float>(packed_output_size);
-  memset(packed_correct_data.get(), 0, packed_output_size * sizeof(float));
-  PackNHWCToNC4HW4Fp32(correct_data, packed_correct_data.get(), conv_param->output_batch_,
-                       conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_);
-
-  printf("==================input_data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_input_size; i++) {
-    std::cout << packed_input.get()[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================weight data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < pack_weight_size; i++) {
-    std::cout << packed_weight[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================output data=================\n");
-  std::cout << std::endl;
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_output[i] << ", ";
-  }
-  std::cout << std::endl;
-  printf("==================expected output data=================\n");
-  for (int i = 0; i < packed_output_size; i++) {
-    std::cout << packed_correct_data.get()[i] << ", ";
-  }
-  std::cout << std::endl;
-  // compare
-  CommonTest::CompareOutputData(packed_output, packed_correct_data.get(), packed_output_size, 0.00001);
 
-  inputs[1]->SetData(nullptr);
-  inputs[2]->SetData(nullptr);
-  MS_LOG(INFO) << "TestConvolutionDwPadFp32 passed";
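+  // run in fp16: enable half precision on the OpenCL runtime and compare with a relaxed 1e-2 tolerance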
+  lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
+  DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
+                                          kNumberTypeFloat16, true, 1e-2);
 }
 
-TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) {
+TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2Fp32) {
   std::vector<std::vector<int>> src_shape{
     {1, 32, 112, 112}, {1, 96, 112, 112}, {1, 144, 56, 56}, {1, 144, 56, 56}, {1, 192, 28, 28},
     {1, 192, 28, 28},  {1, 384, 14, 14},  {1, 576, 14, 14}, {1, 576, 14, 14}, {1, 960, 7, 7},
@@ -685,101 +538,56 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) {
   };
 
   // nhwc
-  size_t in_size = 96 * 112 * 112;
-  auto input_data = std::make_unique<float_t>(in_size);
-  memset(input_data.get(), 0, in_size);
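+  // buffers are large, so allocate them with new(std::nothrow) and return early if allocation fails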
+  const size_t in_size = 96 * 112 * 112;
+  float *input_data = new (std::nothrow) float[in_size];
+  if (input_data == nullptr) {
+    return;
+  }
+  memset(input_data, 0, in_size * sizeof(float));
   for (auto i = 0; i < in_size; ++i) {
-    input_data.get()[i] = 1;
+    input_data[i] = 1;
   }
   // co h w ci
-  size_t wt_size = 576 * 3 * 3;
-  auto weight_data = std::make_unique<float_t>(wt_size);
-  memset(weight_data.get(), 0, wt_size);
+  const size_t wt_size = 576 * 3 * 3;
+  float *weight_data = new (std::nothrow) float[wt_size];
+  if (weight_data == nullptr) {
+    delete[] input_data;
+    return;
+  }
+  memset(weight_data, 0, wt_size * sizeof(float));
   for (auto i = 0; i < wt_size; ++i) {
-    weight_data.get()[i] = 1;
+    weight_data[i] = 1;
   }
-  size_t out_size = 96 * 112 * 112;
-  auto gnd_data = std::make_unique<float_t>(out_size);
-  memset(gnd_data.get(), 0, out_size);
-  //  for (auto i = 0; i < in_size; ++i) {
-  //    gnd_data[i] = 1;
-  //  }
   for (size_t i = 0; i < src_shape.size(); ++i) {
     const int MAX_RUN_TIMES = 1;
     for (int j = 0; j < MAX_RUN_TIMES; ++j) {
       printf("========profiling depthwise, in shape(%d,%d,%d,%d), out shape(%d,%d,%d,%d), iter%d========\n",
              src_shape[i][0], src_shape[i][1], src_shape[i][2], src_shape[i][3], dst_shape[i][0], dst_shape[i][1],
              dst_shape[i][2], dst_shape[i][3], j);
-      auto conv_param = std::make_unique<ConvParameter>();
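+      // build the conv parameters on the stack for each (src, dst) shape pair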
+      auto conv_param = ConvParameter();
       {
-        conv_param->input_batch_ = 1;
-        conv_param->input_h_ = src_shape[i][2];
-        conv_param->input_w_ = src_shape[i][3];
-        conv_param->input_channel_ = src_shape[i][1];
-        conv_param->output_batch_ = 1;
-        conv_param->output_h_ = dst_shape[i][2];
-        conv_param->output_w_ = dst_shape[i][3];
-        conv_param->output_channel_ = dst_shape[i][1];
-        conv_param->kernel_h_ = filter_shape[i][1];
-        conv_param->kernel_w_ = filter_shape[i][2];
-        conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
-        conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
-        conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
-        conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
-        conv_param->dilation_h_ = 1;
-        conv_param->dilation_w_ = 1;
+        conv_param.input_batch_ = 1;
+        conv_param.input_h_ = src_shape[i][2];
+        conv_param.input_w_ = src_shape[i][3];
+        conv_param.input_channel_ = src_shape[i][1];
+        conv_param.output_batch_ = 1;
+        conv_param.output_h_ = dst_shape[i][2];
+        conv_param.output_w_ = dst_shape[i][3];
+        conv_param.output_channel_ = dst_shape[i][1];
+        conv_param.kernel_h_ = filter_shape[i][1];
+        conv_param.kernel_w_ = filter_shape[i][2];
+        conv_param.stride_h_ = conv_param.output_h_ / conv_param.input_h_;
+        conv_param.stride_w_ = conv_param.output_w_ / conv_param.input_w_;
+        conv_param.pad_u_ = (conv_param.kernel_h_ - 1) / 2;
+        conv_param.pad_l_ = (conv_param.kernel_w_ - 1) / 2;
+        conv_param.dilation_h_ = 1;
+        conv_param.dilation_w_ = 1;
       }
-      //      DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, false);
-      DepthWiseTestMain(conv_param.get(), input_data.get(), weight_data.get(), nullptr, schema::Format_NHWC4, false);
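+      // expected output is nullptr: this loop only profiles the kernel, it does not check accuracy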
+      DepthWiseTestMain<float, float>(&conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4,
+                                      kNumberTypeFloat32, false);
     }
   }
-}
-
-TEST_F(TestConvolutionDwOpenCL, Buffer2Image) {
-  std::vector<int> src_shape{1, 96, 64, 64};
-  std::vector<int> dst_shape{1, 96, 32, 32};
-  std::vector<int> filter_shape{96, 3, 3, 1};
-
-  // nhwc
-  size_t in_size = 96 * 112 * 112;
-  auto input_data = std::make_unique<float_t>(in_size);
-  memset(input_data.get(), 0, in_size);
-  for (auto i = 0; i < in_size; ++i) {
-    input_data.get()[i] = 1;
-  }
-  // co h w ci
-  size_t wt_size = 576 * 3 * 3;
-  auto weight_data = std::make_unique<float_t>(wt_size);
-  memset(weight_data.get(), 0, wt_size);
-  for (auto i = 0; i < wt_size; ++i) {
-    weight_data.get()[i] = 1;
-  }
-  size_t out_size = 96 * 112 * 112;
-  auto gnd_data = std::make_unique<float_t>(out_size);
-  memset(gnd_data.get(), 0, out_size);
-  //  for (auto i = 0; i < in_size; ++i) {
-  //    gnd_data[i] = 1;
-  //  }
-  auto conv_param = std::make_unique<ConvParameter>();
-  {
-    conv_param->input_batch_ = 1;
-    conv_param->input_h_ = src_shape[2];
-    conv_param->input_w_ = src_shape[3];
-    conv_param->input_channel_ = src_shape[1];
-    conv_param->output_batch_ = 1;
-    conv_param->output_h_ = dst_shape[2];
-    conv_param->output_w_ = dst_shape[3];
-    conv_param->output_channel_ = dst_shape[1];
-    conv_param->kernel_h_ = filter_shape[1];
-    conv_param->kernel_w_ = filter_shape[2];
-    conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
-    conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
-    conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
-    conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
-    conv_param->dilation_h_ = 1;
-    conv_param->dilation_w_ = 1;
-  }
-  //      DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, true);
-  DepthWiseTestMain(conv_param.get(), input_data.get(), weight_data.get(), gnd_data.get(), schema::Format_NHWC4, true);
+  delete[] input_data;
+  delete[] weight_data;
 }
 }  // namespace mindspore