merge with master

99963c98 · yejianwu · 511ee878 · 2b3d78e0 · 99963c98 · 99963c98
20 changed file
--- a/mace/core/opencl_allocator.cc
+++ b/mace/core/opencl_allocator.cc
@@ -54,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
  cl_int error;
  cl::Image2D *cl_image =
      new cl::Image2D(OpenCLRuntime::Get()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ,
+                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                      img_format,
                      image_shape[0], image_shape[1],
                      0, nullptr, &error);
+  MACE_CHECK(error == CL_SUCCESS);

  return cl_image;
 }

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -17,7 +17,7 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {

  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(output->dtype()));
  auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);

  const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -35,8 +35,8 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(

  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(input->dtype()));
  auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);

  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
  }

  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum<T>::value));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum<T>::value));
+  if (buffer->dtype() == image->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
+  } else {
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+  }
  auto runtime = OpenCLRuntime::Get();
  string kernel_name;
  switch (type) {

--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
 #include <common.h>

-// Supported data type: half/float
-__kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, h, w */
-                                      __global DATA_TYPE *output /* n * c, h, w */,
+__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+                                      __write_only image2d_t output,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
-                                      __private const int in_width) {
-  const int c = get_global_id(0);
-  const int h = get_global_id(1);
-  const int w = get_global_id(2);
-  const int channels = get_global_size(0);
-  const int height = get_global_size(1);
-  const int width = get_global_size(2);
+                                      __private const int in_width,
+                                      __private const int out_height) {
+  const int ch_blk = get_global_id(0);
+  const int ch_blks = get_global_size(0);
+  const int w = get_global_id(1);
+  const int out_width = get_global_size(1);
+  const int hb = get_global_id(2);
+  const int b = hb / out_height;
+  const int h = hb % out_height;

  const float h_in = h * height_scale;
  const float w_in = w * width_scale;
@@ -24,16 +25,26 @@ __kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c,
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;

-  const DATA_TYPE *input_base = input + c * in_height * in_width;
-  DATA_TYPE *output_base = output + c * height * width;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int in_w_offset = ch_blk * in_width;
+  const int in_h_offset = b * in_height;

-  DATA_TYPE top_left = input_base[h_lower * in_width + w_lower];
-  DATA_TYPE top_right = input_base[h_lower * in_width + w_upper];
-  DATA_TYPE bottom_left = input_base[h_upper * in_width + w_lower];
-  DATA_TYPE bottom_right = input_base[h_upper * in_width + w_upper];
+  DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
+          (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
+  DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
+          (int2)(in_w_offset + w_upper, in_h_offset + h_lower));
+  DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
+          (int2)(in_w_offset + w_lower, in_h_offset + h_upper));
+  DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
+          (int2)(in_w_offset + w_upper, in_h_offset + h_upper));

-  const DATA_TYPE top = top_left + (top_right - top_left) * w_lerp;
-  const DATA_TYPE bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
-  output_base[h * width + w] = top + (bottom - top) * h_lerp;
+  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
+  DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
+
+  DATA_TYPE4 out = top + (bottom - top) * h_lerp;
+
+  const int out_w_offset = ch_blk * out_width;
+  const int out_h_offset = b * out_height;
+  WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
 }

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -34,8 +34,8 @@ void Conv1x1(const Tensor *input,
  MACE_CHECK(input_batch == batch);

  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  if (bias != nullptr) {
    built_options.emplace("-DBIAS");

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -26,8 +26,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  if (fused_relu) {

--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,

  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  auto conv_kernel  = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -54,34 +54,42 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
 }


-std::string DataTypeToCLType(const DataType dt) {
+std::string DtToCLDt(const DataType dt) {
+  switch (dt) {
+    case DT_FLOAT:
+      return "float";
+    case DT_HALF:
+      return "half";
+    default:
+      LOG(FATAL) << "Unsupported data type";
+      return "";
+  }
+}
+
+std::string DtToCLCMDDt(const DataType dt) {
+  switch (dt) {
+    case DT_FLOAT:
+      return "f";
+    case DT_HALF:
+      return "h";
+    default:
+      LOG(FATAL) << "Not supported data type for opencl cmd data type";
+      return "";
+  }
+}
+
+std::string DtToUpstreamCLDt(const DataType dt) {
  switch (dt) {
    case DT_FLOAT:
    case DT_HALF:
      return "float";
-    case DT_UINT8:
-      return "uchar";
-    case DT_INT8:
-      return "char";
-    case DT_DOUBLE:
-      return "double";
-    case DT_INT32:
-      return "int";
-    case DT_UINT32:
-      return "int";
-    case DT_UINT16:
-      return "ushort";
-    case DT_INT16:
-      return "short";
-    case DT_INT64:
-      return "long";
    default:
      LOG(FATAL) << "Unsupported data type";
      return "";
  }
 }

-std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
+std::string DtToUpstreamCLCMDDt(const DataType dt) {
  switch (dt) {
    case DT_FLOAT:
    case DT_HALF:

--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                     const BufferType type,
                     std::vector<size_t> &image_shape);

-std::string DataTypeToOPENCLCMDDataType(const DataType dt);
+std::string DtToCLCMDDt(const DataType dt);

-std::string DataTypeToCLType(const DataType dt);
+std::string DtToUpstreamCLCMDDt(const DataType dt);

+std::string DtToCLDt(const DataType dt);
+
+std::string DtToUpstreamCLDt(const DataType dt);

 }  // namespace kernels
 } //  namespace mace

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -32,7 +32,7 @@ static void Pooling3(const Tensor *input,

  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
  built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : "");
  auto pooling_kernel  = runtime->BuildKernel("pooling", "pooling3", built_options);

@@ -80,7 +80,7 @@ static void PoolingN(const Tensor *input,

  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
  auto pooling_kernel  = runtime->BuildKernel("pooling", "poolingn", built_options);

  const uint32_t lws[3] = {1, 8, 128};

--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -23,7 +23,7 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
  auto program = runtime->program();

  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
  if (max_limit_ < 0) {
    auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -6,24 +6,33 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {

-template <>
-void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
+template <typename T>
+void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *input, const Tensor *resize_dims, Tensor *output) {
  const index_t batch = input->dim(0);
-  const index_t channels = input->dim(1);
-  const index_t in_height = input->dim(2);
-  const index_t in_width = input->dim(3);
+  const index_t in_height = input->dim(1);
+  const index_t in_width = input->dim(2);
+  const index_t channels = input->dim(3);
+
+  const index_t channel_blocks = RoundUpDiv4(channels);

  index_t out_height;
  index_t out_width;
  GetOutputSize(resize_dims, &out_height, &out_width);
  MACE_CHECK(out_height > 0 && out_width > 0);
-  std::vector<index_t> out_shape {batch, channels, out_height, out_width};
-  output->Resize(out_shape);
+  std::vector<index_t> output_shape {batch, out_height, out_width, channels};
+  if (input->is_image()) {
+    std::vector<size_t> output_image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+    output->ResizeImage(output_shape, output_image_shape);
+  } else {
+    output->Resize(output_shape);
+  }

  float height_scale =
      CalculateResizeScale(in_height, out_height, align_corners_);
@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(

  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  auto rb_kernel  = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);

  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+
  uint32_t idx = 0;
-  rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  rb_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+  rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
  rb_kernel.setArg(idx++, height_scale);
  rb_kernel.setArg(idx++, width_scale);
-  rb_kernel.setArg(idx++, static_cast<int>(in_height));
-  rb_kernel.setArg(idx++, static_cast<int>(in_width));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));

  auto command_queue = runtime->command_queue();

  cl_int error = command_queue.enqueueNDRangeKernel(
      rb_kernel, cl::NullRange,
-      cl::NDRange(static_cast<int>(batch * channels),
-                  static_cast<int>(out_height), static_cast<int>(out_width)),
-      // TODO (heliangliang) tuning and fix when kwg_size < devisor
-      cl::NDRange(1, 16, kwg_size / 16),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+      cl::NDRange(static_cast<int32_t>(channel_blocks),
+                  static_cast<int32_t>(out_width),
+                  static_cast<int32_t>(out_height * batch)),
+      // TODO tuning
+      cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
+      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS, error);
 }

+template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
+template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
+
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opecl.cc
@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
                                                                Tensor *batch_tensor) {
  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
  auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);

  uint32_t idx = 0;

--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -61,63 +61,90 @@ void ResizeImage(const T *images,
                 const index_t channels,
                 const std::vector<CachedInterpolation> &xs_vec,
                 const std::vector<CachedInterpolation> &ys,
-                 float *output) {
-  const index_t in_channel_size = in_height * in_width;
-  const index_t in_batch_num_values = channels * in_channel_size;
-  const index_t out_channel_size = out_height * out_width;
-  const index_t out_batch_num_values = channels * out_channel_size;
+                 T *output) {
+  const index_t in_batch_num_values = channels * in_height * in_width;
+  const index_t out_batch_num_values = channels * out_height * out_width;
  const CachedInterpolation *xs = xs_vec.data();

-#pragma omp parallel for collapse(2)
+#pragma omp parallel for
  for (index_t b = 0; b < batch_size; ++b) {
-    for (index_t c = 0; c < channels; ++c) {
-      const T *input_ptr =
-          images + in_batch_num_values * b + in_channel_size * c;
-      float *output_ptr =
-          output + out_batch_num_values * b + out_channel_size * c;
-      for (index_t y = 0; y < out_height; ++y) {
-        const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width;
-        const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width;
-        const float ys_lerp = ys[y].lerp;
-        for (index_t x = 0; x < out_width; ++x) {
-          auto xs_lower = xs[x].lower;
-          auto xs_upper = xs[x].upper;
-          auto xs_lerp = xs[x].lerp;
-
-          const float top_left = ys_input_lower_ptr[xs_lower];
-          const float top_right = ys_input_lower_ptr[xs_upper];
-          const float bottom_left = ys_input_upper_ptr[xs_lower];
-          const float bottom_right = ys_input_upper_ptr[xs_upper];
-
-          output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left,
-                                      bottom_right, xs_lerp, ys_lerp);
+    const T *batch_input_ptr = images + in_batch_num_values * b;;
+    T *batch_output_ptr = output + out_batch_num_values * b;
+
+    for (index_t y = 0; y < out_height; ++y) {
+      const T *y_lower_input_ptr =
+        batch_input_ptr + ys[y].lower * in_width * channels;
+      const T *y_upper_input_ptr =
+        batch_input_ptr + ys[y].upper * in_width * channels;
+      T *y_output_ptr = batch_output_ptr + y * out_width * channels;
+      const float ys_lerp = ys[y].lerp;
+
+      for (index_t x = 0; x < out_width; ++x) {
+        const float xs_lerp = xs[x].lerp;
+        const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
+        const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
+        const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
+        const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
+        T *output_ptr = y_output_ptr + x * channels;
+
+        for (index_t c = 0; c < channels; ++c) {
+          const T top_left = top_left_ptr[c];
+          const T top_right = top_right_ptr[c];
+          const T bottom_left = bottom_left_ptr[c];
+          const T bottom_right = bottom_right_ptr[c];
+
+          output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
+              bottom_right, xs_lerp, ys_lerp);
        }
-        output_ptr += out_width;
      }
    }
  }
 }
 }

+struct ResizeBilinearFunctorBase {
+  ResizeBilinearFunctorBase(const std::vector<index_t> &size,
+                            bool align_corners)
+      : align_corners_(align_corners), size_(size) {}
+
+ protected:
+  void GetOutputSize(const Tensor *resize_dims,
+                     index_t *out_height,
+                     index_t *out_width) {
+    if (size_[0] < 0 || size_[1] < 0) {
+      MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
+      Tensor::MappingGuard resize_dims_mapper(resize_dims);
+      auto dims_data = resize_dims->data<int32_t>();
+      *out_height = dims_data[0];
+      *out_width = dims_data[1];
+    } else {
+      *out_height = size_[0];
+      *out_width = size_[1];
+    }
+  }
+
+  bool align_corners_;
+  std::vector<index_t> size_;
+};
+
 template <DeviceType D, typename T>
-class ResizeBilinearFunctor {
- public:
+struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-      : align_corners_(align_corners), size_(size) {}
+      : ResizeBilinearFunctorBase(size, align_corners) {}

  void operator()(const Tensor *input,
                  const Tensor *resize_dims,
                  Tensor *output) {
    const index_t batch = input->dim(0);
-    const index_t channels = input->dim(1);
-    const index_t in_height = input->dim(2);
-    const index_t in_width = input->dim(3);
+    const index_t in_height = input->dim(1);
+    const index_t in_width = input->dim(2);
+    const index_t channels = input->dim(3);

    index_t out_height;
    index_t out_width;
    GetOutputSize(resize_dims, &out_height, &out_width);
    MACE_CHECK(out_height > 0 && out_width > 0);
-    std::vector<index_t> out_shape{batch, channels, out_height, out_width};
+    std::vector<index_t> out_shape{batch, out_height, out_width, channels};
    output->Resize(out_shape);

    Tensor::MappingGuard input_mapper(input);
@@ -146,32 +173,18 @@ class ResizeBilinearFunctor {
    ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
                channels, xs, ys, output_data);
  }
+};

- protected:
-  void GetOutputSize(const Tensor *resize_dims,
-                     index_t *out_height,
-                     index_t *out_width) {
-    if (size_[0] < 0 || size_[1] < 0) {
-      MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
-      Tensor::MappingGuard resize_dims_mapper(resize_dims);
-      auto dims_data = resize_dims->data<int32_t>();
-      *out_height = dims_data[0];
-      *out_width = dims_data[1];
-    } else {
-      *out_height = size_[0];
-      *out_width = size_[1];
-    }
-  }
+template<typename T>
+struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
+  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
+      : ResizeBilinearFunctorBase(size, align_corners) {}

- private:
-  bool align_corners_;
-  std::vector<index_t> size_;
+  void operator()(const Tensor *input,
+                  const Tensor *resize_dims,
+                  Tensor *output);
 };

-template <>
-void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input, const Tensor *resize_dims, Tensor *output);
-
 }  // namespace kernels
 }  // namespace mace


--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -43,7 +43,7 @@ TEST(BufferToImageTest, ArgSmall) {
 }

 TEST(BufferToImageTest, ArgHalfSmall) {
-  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {1});
+  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
 }

 TEST(BufferToImageTest, ArgMedia) {
@@ -97,3 +97,37 @@ TEST(BufferToImageTest, Filter3x3Meida) {
 TEST(BufferToImageTest, Filter3x3Large) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
 }
+
+template<DeviceType D, typename T>
+void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
+  OpsTestNet net;
+  OpDefBuilder("BufferToImage", "BufferToImageTest")
+      .Input("Input")
+      .Output("B2IOutput")
+      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<D, float>("Input", input_shape);
+
+  // Run
+  net.RunOp(D);
+
+  OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
+      .Input("B2IOutput")
+      .Output("I2BOutput")
+      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp(D);
+
+  // Check
+  ExpectTensorNear<float, T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2);
+}
+
+TEST(BufferToImageTest, ArgFloatToHalfSmall) {
+  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+}
--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -23,4 +23,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear")
                             .Build(),
                         ResizeBilinearOp<DeviceType::OPENCL, float>);

+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         ResizeBilinearOp<DeviceType::OPENCL, half>);
+
 }  //  namespace mace
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -19,18 +19,30 @@ static void ResizeBilinearBenchmark(int iters,
  mace::testing::StopTiming();

  OpsTestNet net;
-  OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
-      .Input("Input")
-      .Input("OutSize")
-      .Output("Output")
-      .AddIntsArg("size", {output_height, output_width})
-      .Finalize(net.NewOperatorDef());

  // Add input data
  net.AddRandomInput<D, float>("Input",
-                               {batch, channels, input_height, input_width});
+                               {batch, input_height, input_width, channels});
  net.AddInputFromArray<D, index_t>("OutSize", {2},
                                    {output_height, output_width});
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
+      .Input("InputImage")
+      .Input("OutSize")
+      .Output("OutputImage")
+      .AddIntsArg("size", {output_height, output_width})
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
+      .Input("Input")
+      .Input("OutSize")
+      .Output("Output")
+      .AddIntsArg("size", {output_height, output_width})
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+  }

  // Warm-up
  for (int i = 0; i < 5; ++i) {
@@ -58,9 +70,12 @@ static void ResizeBilinearBenchmark(int iters,

 #define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE)        \
  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU);  \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, NEON); \
  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);

+// SNPE 835 GPU: 6870us
+BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
+BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
+
 BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
 BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float);
 BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float);

--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -23,14 +23,14 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
  // Add input data
  vector<float> input(24);
  std::iota(begin(input), end(input), 0);
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input);
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
  net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2});

  // Run
  net.RunOp();

  // Check
-  auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -49,14 +49,14 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
  // Add input data
  vector<float> input(24);
  std::iota(begin(input), end(input), 0);
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input);
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
  net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2});

  // Run
  net.RunOp();

  // Check
-  auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -65,6 +65,7 @@ template <DeviceType D>
 void TestRandomResizeBilinear() {
  srand(time(nullptr));
  testing::internal::LogToStderr();
+
  for (int round = 0; round < 10; ++round) {
    int batch = 1 + rand() % 5;
    int channels = 1 + rand() % 100;
@@ -72,39 +73,54 @@ void TestRandomResizeBilinear() {
    int width = 1 + rand() % 100;
    int in_height = 1 + rand() % 100;
    int in_width = 1 + rand() % 100;
+    int align_corners = rand() % 1;

    // Construct graph
    OpsTestNet net;
-    OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
-        .Input("Input")
-        .Input("OutSize")
-        .Output("Output")
-        .AddIntArg("align_corners", 1)
-        .AddIntsArg("size", {height, width})
-        .Finalize(net.NewOperatorDef());
-
    // Add input data
    net.AddRandomInput<D, float>("Input",
-                                 {batch, channels, in_height, in_width});
+                                 {batch, in_height, in_width, channels});
    net.AddInputFromArray<D, int>("OutSize", {2}, {height, width});

-    // Run
-    net.RunOp(D);
-    Tensor actual;
-    actual.Copy(*net.GetOutput("Output"));
-
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
+      .Input("Input")
+      .Input("OutSize")
+      .Output("Output")
+      .AddIntArg("align_corners", align_corners)
+      .AddIntsArg("size", {height, width})
+      .Finalize(net.NewOperatorDef());
    // Run on CPU
    net.RunOp(DeviceType::CPU);
-    Tensor *expected = net.GetOutput("Output");
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    if (D == DeviceType::OPENCL) {
+      BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+
+      OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
+        .Input("InputImage")
+        .Input("OutSize")
+        .Output("OutputImage")
+        .AddIntArg("align_corners", align_corners)
+        .AddIntsArg("size", {height, width})
+        .Finalize(net.NewOperatorDef());
+      // Run
+      net.RunOp(D);

+      ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput", kernels::BufferType::IN_OUT);
+    } else {
+      // TODO support NEON
+    }
    // Check
-    ExpectTensorNear<float>(*expected, actual, 0.001);
+    ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.001);
  }
 }

+/*
 TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) {
  TestRandomResizeBilinear<DeviceType::NEON>();
 }
+*/

 TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
  TestRandomResizeBilinear<DeviceType::OPENCL>();

--- a/mace/python/tools/tf_ops_stats.py
+++ b/mace/python/tools/tf_ops_stats.py
@@ -92,6 +92,7 @@ def main(unused_args):
            size = tensor_values[input_name]
            break
        key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners)
+        print(key)
        hist_inc(stats, key)
      elif op.type in ['AvgPool', 'MaxPool']:
        padding = op.get_attr('padding')