diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 6c5106db25c0f12cb625b6e5e0c80c0497541804..9f1ed60afc91ad4f2dfdcd13aa6eebf8fd2839b6 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -17,7 +17,7 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(output->dtype()));
   auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
 
   const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index c7cd37e3ec7e6c1e0dbe31cf335bb105869e35c2..8b6804dce3cfef66103de6256991cb4b12ef0fc6 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -30,7 +30,7 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
   auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 61faa995ce86792a302068af11aed7b784b2834f..f3af3d22622bd5e893347d958da76dbec71a450a 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   }
 
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum<T>::value));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum<T>::value));
+  if (buffer->dtype() == image->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
+  } else {
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+  }
   auto runtime = OpenCLRuntime::Get();
   string kernel_name;
   switch (type) {
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index c3a17c7bd77b760b3e9dfe31c9a3158e3348db58..d759689c6dc1ee8ffbfa98f2a4a58577a50c4271 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -34,8 +34,8 @@ void Conv1x1(const Tensor *input,
   MACE_CHECK(input_batch == batch);
 
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   built_options.emplace("-DSTRIDE=" + ToString(stride));
   if (bias != nullptr) {
     built_options.emplace("-DBIAS");
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index e29c4d92f7ee52ad6db3b9714e1fe94749a4c3d4..24bf90a1178961665ec1cf65935809d8409987bd 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -26,8 +26,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
 
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   built_options.emplace(bias != nullptr ? "-DBIAS" : "");
   built_options.emplace("-DSTRIDE=" + ToString(stride));
   if (fused_relu) {
diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
index 60ce2a829a78a0a0439dd1e287c61f2dee4b490b..1402131df164cb0d1ba348617b3988e78f71c574 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
   built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
   built_options.emplace(bias != nullptr ? "-DBIAS" : "");
   auto conv_kernel  = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 4f4d1c56147df61da58a5a6478f1958e2b289a39..2c1dc264bd5ac1ddaeeaf47ea54a6e8b9e32e13a 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -54,34 +54,42 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
 }
 
 
-std::string DataTypeToCLType(const DataType dt) {
+// Maps dt to the type name passed via -DDATA_TYPE: "float" or "half".
+std::string DtToCLDt(const DataType dt) {
+  if (dt == DT_FLOAT) {
+    return "float";
+  }
+  if (dt == DT_HALF) {
+    return "half";
+  }
+  LOG(FATAL) << "Unsupported data type";  // no other type is handled
+  return "";
+}
+
+// Maps dt to the value passed via -DCMD_DATA_TYPE: "f" or "h".
+std::string DtToCLCMDDt(const DataType dt) {
+  if (dt == DT_FLOAT) {
+    return "f";
+  }
+  if (dt == DT_HALF) {
+    return "h";
+  }
+  LOG(FATAL) << "Not supported data type for opencl cmd data type";  // no other type is handled
+  return "";
+}
+
+std::string DtToUpstreamCLDt(const DataType dt) {  // DT_FLOAT and DT_HALF both map to "float"
   switch (dt) {
     case DT_FLOAT:
     case DT_HALF:
       return "float";
-    case DT_UINT8:
-      return "uchar";
-    case DT_INT8:
-      return "char";
-    case DT_DOUBLE:
-      return "double";
-    case DT_INT32:
-      return "int";
-    case DT_UINT32:
-      return "int";
-    case DT_UINT16:
-      return "ushort";
-    case DT_INT16:
-      return "short";
-    case DT_INT64:
-      return "long";
     default:
       LOG(FATAL) << "Unsupported data type";
       return "";
   }
 }
 
-std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
+std::string DtToUpstreamCLCMDDt(const DataType dt) {
   switch (dt) {
     case DT_FLOAT:
     case DT_HALF:
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 1ad94aa5d2545f059ec785c0b4ec36a87155fb49..70d74e5886c61a50c0a5fb684d02ecc6e00403cd 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      const BufferType type,
                      std::vector<size_t> &image_shape);
 
-std::string DataTypeToOPENCLCMDDataType(const DataType dt);
+std::string DtToCLCMDDt(const DataType dt);  // DT_FLOAT -> "f", DT_HALF -> "h"
 
-std::string DataTypeToCLType(const DataType dt);
+std::string DtToUpstreamCLCMDDt(const DataType dt);
 
+std::string DtToCLDt(const DataType dt);  // DT_FLOAT -> "float", DT_HALF -> "half"
+
+std::string DtToUpstreamCLDt(const DataType dt);  // DT_FLOAT and DT_HALF -> "float"
 
 }  // namespace kernels
 } //  namespace mace
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 0aaa89ae2c649583dddafaffbcce428d4ffc94fd..fb9216f767dfd2770a6ccfc405283e51dea2ffe5 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -32,7 +32,7 @@ static void Pooling3(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
   built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : "");
   auto pooling_kernel  = runtime->BuildKernel("pooling", "pooling3", built_options);
 
@@ -80,7 +80,7 @@ static void PoolingN(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
   auto pooling_kernel  = runtime->BuildKernel("pooling", "poolingn", built_options);
 
   const uint32_t lws[3] = {1, 8, 128};
diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc
index 1149b965a2fc91c5394c97b7028d872b827dc125..e7f527a5380a8f965d3781335f4b2a580fdcd3e7 100644
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -23,7 +23,7 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
   auto program = runtime->program();
 
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
   if (max_limit_ < 0) {
     auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
     const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 67d74ed32f1217748a412a0ecb609d0f7e60dc7a..27dd8e62b96422c368e324d249900b5e8d5f7767 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -41,8 +41,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
   auto dt = DataTypeToEnum<T>::value;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto rb_kernel  = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opecl.cc
index 2716501c880fcd4fb2232e292b9396e27cfff2f3..72590be5e87ca1c5b721972855b8869e397df82c 100644
--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opecl.cc
@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
                                                                 Tensor *batch_tensor) {
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
   auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
 
   uint32_t idx = 0;
diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc
index 7bd667ca3988320529a702224e3045a99ca38de8..3836a7ae90291dbbfb80da20cf78a1bb1c79d87e 100644
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -43,7 +43,7 @@ TEST(BufferToImageTest, ArgSmall) {
 }
 
 TEST(BufferToImageTest, ArgHalfSmall) {
-  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {1});
+  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
 }
 
 TEST(BufferToImageTest, ArgMedia) {
@@ -97,3 +97,37 @@ TEST(BufferToImageTest, Filter3x3Meida) {
 TEST(BufferToImageTest, Filter3x3Large) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
 }
+
+template<DeviceType D, typename T>
+void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
+  OpsTestNet net;
+  OpDefBuilder("BufferToImage", "BufferToImageTest")
+      .Input("Input")
+      .Output("B2IOutput")
+      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
+      .Finalize(net.NewOperatorDef());
+
+  // The input buffer is always float, while both ops are configured with type T.
+  net.AddRandomInput<D, float>("Input", input_shape);
+
+  // Run with the BufferToImage op defined above ("Input" -> "B2IOutput").
+  net.RunOp(D);
+
+  OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
+      .Input("B2IOutput")
+      .Output("I2BOutput")
+      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
+      .Finalize(net.NewOperatorDef());
+
+  // Run again now that the ImageToBuffer op ("B2IOutput" -> "I2BOutput") is defined.
+  net.RunOp(D);
+
+  // The round-tripped buffer is compared against the original input; 1e-2 is presumably the tolerance.
+  ExpectTensorNear<float, T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2);
+}
+
+TEST(BufferToImageTest, ArgFloatToHalfSmall) {  // float input, T = half, ARGUMENT buffer of shape {11}
+  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+}