diff --git a/mace/core/half.h b/mace/core/half.h index dde806fb153f76982f26f1c9d6beb28eab516ab2..9df24bd43956aa56b5de833800d63cdda5281269 100644 --- a/mace/core/half.h +++ b/mace/core/half.h @@ -1098,7 +1098,7 @@ namespace half_float /// Conversion constructor. /// \param rhs float to convert - explicit half(float rhs) : data_(detail::float2half(rhs)) {} + half(float rhs) : data_(detail::float2half(rhs)) {} /// Conversion to single-precision. /// \return single precision value representing expression value diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index a717c6a48513eb075ae4b36124213a109a7f4786..e9a41cfcafef011da308a4df81b3dbc79874bfb2 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -11,13 +11,23 @@ namespace mace { namespace kernels { +struct Conv2dFunctorBase { + Conv2dFunctorBase(const int *strides, + const Padding &paddings, + const int *dilations) + : strides_(strides), dilations_(dilations), paddings_(paddings) {} + + const int *strides_; // [stride_h, stride_w] + const int *dilations_; // [dilation_h, dilation_w] + Padding paddings_; +}; + template -struct Conv2dFunctor { - Conv2dFunctor() {} +struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(const int *strides, const Padding &paddings, const int *dilations) - : strides_(strides), dilations_(dilations), paddings_(paddings) {} + : Conv2dFunctorBase(strides, paddings, dilations) {} void operator()(const Tensor *input, const Tensor *filter, @@ -76,9 +86,10 @@ struct Conv2dFunctor { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { for (int c = 0; c < channels; ++c) { - T bias_channel = bias_data ? 
bias_data[c] : 0; + T bias_channel = 0.0f; + if (bias) bias_channel = bias_data[c]; *output_data = bias_channel; - T sum = 0; + T sum = 0.0f; const T *filter_ptr = filter_data + c; for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { @@ -113,9 +124,6 @@ struct Conv2dFunctor { } - const int *strides_; // [stride_h, stride_w] - const int *dilations_; // [dilation_h, dilation_w] - Padding paddings_; }; template<> @@ -123,11 +131,19 @@ void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output); -template<> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output); + +template +struct Conv2dFunctor : Conv2dFunctorBase { + Conv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : Conv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output); +}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 511e4598309561a5a453113784db9de4d933399b..61faa995ce86792a302068af11aed7b784b2834f 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -24,8 +24,8 @@ void BufferToImageFunctor::operator()(Tensor *buffer, } std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum::value)); auto runtime = OpenCLRuntime::Get(); string kernel_name; switch (type) { diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 
689a7df2f7663e3ed509faedc08eb871ba9c2a1b..7aaf367c560b5ef24242d18972dfd96ab1db8d61 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -10,7 +10,10 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __read_only image2d_t bn_offset, /* cout%4 * cout/4 */ #endif __write_only image2d_t output, + __private const int in_height, + __private const int in_width, __private const int in_ch_blks, + __private const int height, __private const int width) { const int out_ch_blk = get_global_id(0); const int out_w_blk = get_global_id(1); @@ -32,24 +35,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #endif int4 w; +#if STRIDE == 1 w.x = out_w_blk; w.y = w.x + out_w_blks; w.z = w.y + out_w_blks; w.w = w.z + out_w_blks; + int out_hb_idx = (out_hb % height); +#else + w.x = out_w_blk * 2; + w.y = (out_w_blk + out_w_blks) * 2; + w.z = (out_w_blk + 2 * out_w_blks) * 2; + w.w = (out_w_blk + 3 * out_w_blks) * 2; + int out_hb_idx = (out_hb % height) * 2; +#endif + + w.x = select(w.x, INT_MIN, w.x >= in_width); + w.y = select(w.y, INT_MIN, w.y >= in_width); + w.z = select(w.z, INT_MIN, w.z >= in_width); + w.w = select(w.w, INT_MIN, w.w >= in_width); + + out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height, + -1, + out_hb_idx >= in_height); // Unrolling this loop hurt perfmance int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb)); - DATA_TYPE4 in1 = 0; - DATA_TYPE4 in2 = 0; - DATA_TYPE4 in3 = 0; - if (w.y < width) { - // conditional load hurt perf, this branching helps sometimes - in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb)); - in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb)); - in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb)); - } + + DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + 
w.x, out_hb_idx)); + DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)); + DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)); + DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)); const int filter_x0 = in_ch_blk << 2; DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); @@ -78,7 +94,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out3 += in3.z * weights2; out3 += in3.w * weights3; - in_x_base += width; + in_x_base += in_width; } #ifdef FUSED_BATCH_NORM @@ -111,14 +127,19 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #endif const int out_x_base = out_ch_blk * width; - WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0); + int out_x_idx = out_w_blk; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); + + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1); - if (w.y >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2); - if (w.z >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); - if (w.w >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3); } diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index ef76aa79c56f310f77eda05a0a97c3091d9faba2..81d39c1eb8a08254d26112341bdeed827be32c39 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -20,143 +20,135 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int 
rounded_in_ch = in_ch_blks * 4; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0}; #ifdef BIAS - out[0] = - CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0)); - out[1] = out[0]; - out[2] = out[0]; - out[3] = out[0]; - out[4] = out[0]; + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; + DATA_TYPE4 out4 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; + DATA_TYPE4 out4 = 0; +#endif + +#if STRIDE == 1 + int in_width0 = out_w_blk - padding_left; + int in_width1 = in_width0 + out_w_blks; + int in_width2 = in_width1 + out_w_blks; + int in_width3 = in_width2 + out_w_blks; + int in_width4 = in_width3 + out_w_blks; + const int height_idx = (out_hb % out_height) - padding_top; +#else + int in_width0 = out_w_blk * 2 - padding_left; + int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; + int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; + int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; + int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left; + const int height_idx = (out_hb % out_height) * 2 - padding_top; #endif - int w[5]; - w[0] = out_w_blk - padding_left; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; - w[4] = w[3] + out_w_blks; - - const int batch_idx = out_hb / out_height; - const int height_idx = out_hb % out_height; - int in_hb[3]; - in_hb[0] = height_idx - padding_top; - in_hb[1] = in_hb[0] + 1; - in_hb[2] = in_hb[1] + 1; - // Judge the height border for padding input. - in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height; - in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height; - in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? 
-1 : in_hb[2] + batch_idx * in_height; - - const int input_image_width = in_ch_blks * in_width; - - VEC_DATA_TYPE(DATA_TYPE, 4) in[5]; - VEC_DATA_TYPE(DATA_TYPE, 4) weights[4]; + const int batch_idx = (out_hb / out_height) * in_height; + + DATA_TYPE4 in0, in1, in2, in3, in4; + DATA_TYPE4 weights0, weights1, weights2, weights3; int in_idx, hb_idx, width_idx, in_width_idx; // Unrolling this loop hurt perfmance - for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - for (int i = 0; i < 9; ++i) { - - in_idx = in_ch_blk * in_width; - - hb_idx = i / 3; - width_idx = i % 3; - in_width_idx = w[0] + width_idx; - // Judge the width border for padding input. - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[0] = 0; - } else { - in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[1] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[1] = 0; - } else { - in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { + for (short width_idx = 0; width_idx < 3; ++width_idx) { + + in_idx = in_ch_blk * in_width; + + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); + + int in_width_value; +#define READ_INPUT(i) \ + in_width_value = in_width##i + width_idx; \ + in_width_value = select(in_idx + in_width_value, \ + -1, \ + (in_width_value < 0 || in_width_value >= in_width)); \ + in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); + + READ_INPUT(0); + READ_INPUT(1); + READ_INPUT(2); + READ_INPUT(3); + READ_INPUT(4); + +#undef READ_INPUT + + int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; + weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); + 
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + + // Will prefetch L2 improve performance? How to prefetch image data? + + // Interleaving load and mul does not improve performance as expected + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; + + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; + + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + out4 += in4.x * weights0; + out4 += in4.y * weights1; + out4 += in4.z * weights2; + out4 += in4.w * weights3; } - in_width_idx = w[2] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[2] = 0; - } else { - in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[3] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[3] = 0; - } else { - in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[4] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[4] = 0; - } else { - in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - - - int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch; - weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); - weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); - weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, 
out_ch_blk)); - weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); - - // Will prefetch L2 improve performance? How to pretch image data? - - // Interleaving load and mul does not improve performance as expected - out[0] += in[0].x * weights[0]; - out[0] += in[0].y * weights[1]; - out[0] += in[0].z * weights[2]; - out[0] += in[0].w * weights[3]; - - out[1] += in[1].x * weights[0]; - out[1] += in[1].y * weights[1]; - out[1] += in[1].z * weights[2]; - out[1] += in[1].w * weights[3]; - - out[2] += in[2].x * weights[0]; - out[2] += in[2].y * weights[1]; - out[2] += in[2].z * weights[2]; - out[2] += in[2].w * weights[3]; - - out[3] += in[3].x * weights[0]; - out[3] += in[3].y * weights[1]; - out[3] += in[3].z * weights[2]; - out[3] += in[3].w * weights[3]; - - out[4] += in[4].x * weights[0]; - out[4] += in[4].y * weights[1]; - out[4] += in[4].z * weights[2]; - out[4] += in[4].w * weights[3]; } } const int out_x_base = out_ch_blk * out_width; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[0] + padding_left, out_hb), - out[0]); - - w[1] += padding_left; - if (w[1] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[1], out_hb), - out[1]); - - w[2] += padding_left; - if (w[2] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[2], out_hb), - out[2]); - - w[3] += padding_left; - if (w[3] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[3], out_hb), - out[3]); - - w[4] += padding_left; - if (w[4] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[4], out_hb), - out[4]); + int w = out_w_blk; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out0); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out1); + + w += out_w_blks; + if (w >= out_width) return; + 
WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out2); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out3); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out4); + } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 528928e618abf37a0220ed1d9ebf6a5a7c602564..46066b01337db0a8567119259c1efc45d2a97e6e 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -10,33 +10,33 @@ namespace kernels { extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); -template <> -void Conv2dFunctor::operator()(const Tensor *input, +template +void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output) { typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + DataType dt, Tensor *output); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5][2] = { {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, {nullptr, nullptr}, - {Conv2dOpenclK3x3S1, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, {nullptr, nullptr}, {nullptr, nullptr}}; @@ -50,7 +50,7 @@ void Conv2dFunctor::operator()(const 
Tensor *input, << " stride " << strides_[0] << "x" << strides_[1] << " is not implemented yet, using slow version"; // TODO(heliangliang) The CPU/NEON kernel should map the buffer - Conv2dFunctor(strides_, paddings_, dilations_)( + Conv2dFunctor(strides_, paddings_, dilations_)( input, filter, bias, output); return; } @@ -70,8 +70,11 @@ void Conv2dFunctor::operator()(const Tensor *input, } auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, paddings.data(), output); + conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum::value, output); } +template struct Conv2dFunctor; +template struct Conv2dFunctor; + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index e8a88db510173d47d1c68d7cfb1c2678a10eb91a..e089e91251967a92a7b5a2bbfba340358bfbeb04 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, + const DataType dt, Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -29,13 +30,11 @@ void Conv1x1(const Tensor *input, const index_t width_blocks = RoundUpDiv4(width); const index_t input_channel_blocks = RoundUpDiv4(input_channels); - MACE_CHECK(stride == 1); MACE_CHECK(input_batch == batch); - MACE_CHECK(stride != 1 || (input_height == height && input_width == width)); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt)); built_options.emplace("-DSTRIDE=" + ToString(stride)); if (bias != nullptr) { built_options.emplace("-DBIAS"); @@ -54,7 +53,10 
@@ void Conv1x1(const Tensor *input, conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); } conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input_height)); + conv_2d_kernel.setArg(idx++, static_cast(input_width)); conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); auto command_queue = runtime->command_queue(); @@ -73,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 1, output); + Conv1x1(input, filter, bias, 1, dt, output); }; extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 2, output); + Conv1x1(input, filter, bias, 2, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 5836caa8842cfc1749b2ea78ca6d55fcf11e2e3b..b280b0420f67b7d96be87cbaa8cf8374fff6a33c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -13,7 +13,8 @@ namespace kernels { static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const Tensor *bias, const uint32_t stride, - const int *padding, Tensor *output) { + const int *padding, const DataType dt, + Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -25,9 +26,10 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const index_t width_blocks = RoundUpDiv(width); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + 
DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); @@ -62,12 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, } void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 1, padding, output); + const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { + Conv2d3x3S12(input, filter, bias, 1, padding, dt, output); }; void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { + const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { + Conv2d3x3S12(input, filter, bias, 2, padding, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 05221e55dedde3c7cc17d3f99d2818491d930b87..4f4d1c56147df61da58a5a6478f1958e2b289a39 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -57,9 +57,8 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ std::string DataTypeToCLType(const DataType dt) { switch (dt) { case DT_FLOAT: - return "float"; case DT_HALF: - return "half"; + return "float"; case DT_UINT8: return "uchar"; case DT_INT8: @@ -85,9 +84,8 @@ std::string DataTypeToCLType(const DataType dt) { std::string DataTypeToOPENCLCMDDataType(const DataType dt) { switch (dt) { case DT_FLOAT: - return "f"; case DT_HALF: - return "h"; + return "f"; default: LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_to_image.cc index 
fe726d1be60c0cd83613fb1834396e01cab9cd04..56711794b7fef1546ec67e63d873289bea2ef1cc 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_to_image.cc @@ -14,6 +14,6 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") .TypeConstraint("T") .Build(), - BufferToImageOp); + BufferToImageOp); } // namespace mace diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index ea5fbe21592830bcc31ef303311b15aba3b3a98c..7bd667ca3988320529a702224e3045a99ca38de8 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Add input data @@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Run @@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { TestBidirectionTransform(kernels::ARGUMENT, {1}); } +TEST(BufferToImageTest, ArgHalfSmall) { + TestBidirectionTransform(kernels::ARGUMENT, {1}); +} + TEST(BufferToImageTest, ArgMedia) { TestBidirectionTransform(kernels::ARGUMENT, {11}); } diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 128c849aa9978b569423f3b25afccf5b7c607f8c..617bd2c5600670513f67140979fd3ccee3ed6c98 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -11,6 +11,11 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D") .Build(), Conv2dOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D") + .TypeConstraint("T") + .Build(), + Conv2dOp); + #if __ARM_NEON REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D") .TypeConstraint("T") @@ -23,4 +28,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D") .Build(), Conv2dOp); 
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D") + .TypeConstraint("T") + .Build(), + Conv2dOp); + } // namespace mace diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 2409013ec322966fe2ffc4c682bc1bcbd17da19b..b7f6fc731dc0e092d74c5ef6b7434e61e79635f1 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -27,15 +27,15 @@ static void Conv2d(int iters, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Filter", + net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Filter", {kernel_h, kernel_w, channels, output_channels}); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Bias", {output_channels}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") .Input("FilterImage") @@ -44,6 +44,7 @@ static void Conv2d(int iters, .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -54,6 +55,7 @@ static void Conv2d(int iters, .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); } @@ -88,43 +90,42 @@ static void Conv2d(int iters, 
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); // ICNet -BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float); -BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float); +BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half); // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 -BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); +BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half); // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 -BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); +BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half); +BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half); // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 -BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float); +BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half); // Test RGB <-> YUV -BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); -BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); - -BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments -BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); -BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); -BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); -BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); -BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); -BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); -BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); -BM_CONV_2D(1, 64, 33, 31, 
3, 3, 2, VALID, 128, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); -BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); -BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); +//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); +//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); +// +//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments +//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); +//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); +//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); +//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); +//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); +//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); +//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); } // namespace mace diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 3ee0dd6085ae2c06d5aeac0d82c9704352b63152..b4fd374b578d3b1eef058f495d331c2182619246 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -98,9 +98,9 @@ void 
TestNHWCSimple3x3VALID() { net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") .Input("FilterImage") @@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } template @@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() { net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", 
"InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") .Input("FilterImage") @@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - auto expected = CreateTensor( + auto expected = CreateTensor( {1, 3, 3, 1}, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } TEST_F(Conv2dOpTest, CPUSimple) { @@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) { TestSimple3x3WithoutBias(); } -template +template void TestNHWCSimple3x3WithoutBias() { OpsTestNet net; // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {3, 3, 2, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); 
+ BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") @@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run @@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() { // Check auto expected = CreateTensor({1, 1, 1, 1}, {18.0f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } TEST_F(Conv2dOpTest, CPUWithoutBias) { - TestNHWCSimple3x3WithoutBias(); + TestNHWCSimple3x3WithoutBias(); } TEST_F(Conv2dOpTest, OPENCLWithoutBias) { - TestNHWCSimple3x3WithoutBias(); + TestNHWCSimple3x3WithoutBias(); } template @@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) { TestCombined3x3(); } -template +template static void TestNHWCCombined3x3() { // Construct graph OpsTestNet net; // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {3, 3, 2, 2}, {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputImage") @@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() { .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() { .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -388,16 +396,21 @@ static void TestNHWCCombined3x3() { {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); + +} +TEST_F(Conv2dOpTest, CPUStride2) { + TestNHWCCombined3x3(); } -TEST_F(Conv2dOpTest, CPUCombined) { - TestNHWCCombined3x3(); 
+TEST_F(Conv2dOpTest, OPENCLStride2) { + TestNHWCCombined3x3(); } template void TestConv1x1() { + // Construct graph OpsTestNet net; // Add input data @@ -415,37 +428,35 @@ void TestConv1x1() { {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); - // Construct graph if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("Conv2D", "Conv2DTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run net.RunOp(D); - // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); - + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 
1}) + .Finalize(net.NewOperatorDef()); + // Run net.RunOp(D); } @@ -470,7 +481,7 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } -template +template static void TestComplexConvNxNS12(const std::vector &shape) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -478,7 +489,6 @@ static void TestComplexConvNxNS12(const std::vector &shape) { srand(time(NULL)); // generate random input - // TODO test all sizes index_t batch = 3 + (rand() % 10); index_t height = shape[0]; index_t width = shape[1]; @@ -494,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector &shape) { .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( + net.AddRandomInput("Input", {batch, height, width, input_channels}); + net.AddRandomInput( "Filter", {kernel_h, kernel_w, input_channels, output_channels}); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Bias", {output_channels}); // run on cpu net.RunOp(); @@ -509,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector &shape) { expected.Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") @@ -521,16 +532,17 @@ static void TestComplexConvNxNS12(const std::vector &shape) { .AddIntsArg("strides", {stride_h, 
stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; for (int kernel_size : {1, 3}) { - for (int stride : {1}) { + for (int stride : {1, 2}) { func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, SAME); } @@ -538,9 +550,90 @@ static void TestComplexConvNxNS12(const std::vector &shape) { } TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { - TestComplexConvNxNS12({32, 32, 64, 128}); + TestComplexConvNxNS12({32, 32, 32, 64}); } TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { - TestComplexConvNxNS12({107, 113, 5, 7}); + TestComplexConvNxNS12({107, 113, 5, 7}); +} + +template +static void TestHalfComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + std::vector float_input_data; + GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data); + std::vector float_filter_data; + GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, 
float_filter_data); + std::vector float_bias_data; + GenerateRandomRealTypeData({output_channels}, float_bias_data); + // Add input data + net.AddInputFromArray("Input", {batch, height, width, input_channels}, float_input_data); + net.AddInputFromArray( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + } + } +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) { + TestHalfComplexConvNxNS12({32, 32, 32, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) { + TestHalfComplexConvNxNS12({107, 113, 5, 7}); } diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 3bdb3ca5cd65152072b5074450331f76eebeadae..8d593940cf0c5059d5064a27c7edb3558b9f559b 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -210,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector &shape, std::vector &res) { std::random_device rd; std::mt19937 gen(rd()); 
- std::normal_distribution nd(0, 1); + std::normal_distribution nd(0, 1); index_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); res.resize(size); - std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + if (DataTypeToEnum::value == DT_HALF) { + std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast(nd(gen)); }); + } else { + std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + } } template @@ -290,39 +294,40 @@ inline void ExpectEqual(const double &a, const double &b) { EXPECT_DOUBLE_EQ(a, b); } -inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), y.dtype()); +inline void AssertSameDims(const Tensor &x, const Tensor &y) { ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " << "y.shape [ " << ShapeToString(y) << "]"; } -template ::value> +template ::value> struct Expector; // Partial specialization for float and double. -template -struct Expector { - static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } +template +struct Expector { + static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); } static void Equal(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { ExpectEqual(a(i), b(i)); } } static void Near(const Tensor &x, const Tensor &y, const double abs_err) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = 
x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b << " index = " << i; @@ -335,10 +340,18 @@ template void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { static_assert(is_floating_point_type::value, "T is not a floating point type"); - Expector::Near(x, y, abs_err); + Expector::Near(x, y, abs_err); +} + +template +void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { + static_assert(is_floating_point_type::value + && is_floating_point_type::value, + "T is not a floating point type"); + Expector::Near(x, y, abs_err); } -template +template void BufferToImage(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -347,6 +360,7 @@ void BufferToImage(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run @@ -355,7 +369,7 @@ void BufferToImage(OpsTestNet &net, net.Sync(); } -template +template void ImageToBuffer(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -364,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run