Commit eef80d7c authored by yejianwu

merge with master

...@@ -23,3 +23,11 @@ config_setting( ...@@ -23,3 +23,11 @@ config_setting(
}, },
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
) )
config_setting(
name = "is_profiling",
define_values = {
"profiling": "true",
},
visibility = ["//visibility:public"],
)
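Note: this config_setting is presumably selected by passing --define profiling=true to bazel build; the if_profiling macro loaded from //mace:mace.bzl in the next hunk is assumed to wrap a select() on //mace:is_profiling, since mace.bzl itself is not part of this diff.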
...@@ -7,7 +7,7 @@ package( ...@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0 licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android") load("//mace:mace.bzl", "if_android", "if_profiling")
cc_library( cc_library(
name = "opencl_runtime", name = "opencl_runtime",
...@@ -19,7 +19,7 @@ cc_library( ...@@ -19,7 +19,7 @@ cc_library(
"runtime/opencl/cl2.hpp", "runtime/opencl/cl2.hpp",
"runtime/opencl/*.h", "runtime/opencl/*.h",
]), ]),
copts = ["-std=c++11"], copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
deps = [ deps = [
":logging", ":logging",
"@opencl_headers//:opencl20_headers", "@opencl_headers//:opencl20_headers",
......
...@@ -1098,7 +1098,7 @@ namespace half_float ...@@ -1098,7 +1098,7 @@ namespace half_float
/// Conversion constructor. /// Conversion constructor.
/// \param rhs float to convert /// \param rhs float to convert
explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {} half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
/// Conversion to single-precision. /// Conversion to single-precision.
/// \return single precision value representing expression value /// \return single precision value representing expression value
......
...@@ -13,6 +13,7 @@ namespace { ...@@ -13,6 +13,7 @@ namespace {
static cl_channel_type DataTypeToCLChannelType(const DataType t) { static cl_channel_type DataTypeToCLChannelType(const DataType t) {
switch (t) { switch (t) {
case DT_HALF: case DT_HALF:
return CL_HALF_FLOAT;
case DT_FLOAT: case DT_FLOAT:
return CL_FLOAT; return CL_FLOAT;
case DT_INT8: case DT_INT8:
...@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error; cl_int error;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(), new cl::Image2D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR , CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format, img_format,
image_shape[0], image_shape[1], image_shape[0], image_shape[1],
0, nullptr, &error); 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
return cl_image; return cl_image;
} }
......
...@@ -6,6 +6,24 @@ ...@@ -6,6 +6,24 @@
namespace mace { namespace mace {
OpKeyBuilder::OpKeyBuilder(const char *op_name): op_name_(op_name) {}
OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name,
const DataType allowed) {
type_constraint_[attr_name] = allowed;
return *this;
}
const std::string OpKeyBuilder::Build() {
static const std::vector<std::string> type_order = {"T"};
std::string key = op_name_;
for (auto type : type_order) {
key += type + "_" + DataTypeToString(type_constraint_[type]);
}
return key;
}
std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() { std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() {
static std::map<int32_t, OperatorRegistry *> g_device_type_registry; static std::map<int32_t, OperatorRegistry *> g_device_type_registry;
return &g_device_type_registry; return &g_device_type_registry;
...@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def, ...@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
Workspace *ws, Workspace *ws,
DeviceType type) { DeviceType type) {
OperatorRegistry *registry = gDeviceTypeRegistry()->at(type); OperatorRegistry *registry = gDeviceTypeRegistry()->at(type);
return registry->Create(operator_def.type(), operator_def, ws); const int dtype = ArgumentHelper::GetSingleArgument<OperatorDef, int>(operator_def,
"T",
static_cast<int>(DT_FLOAT));
return registry->Create(OpKeyBuilder(operator_def.type().data())
.TypeConstraint("T", static_cast<DataType>(dtype))
.Build(),
operator_def,
ws);
} }
OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws) OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
......
...@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer { ...@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer {
} }
}; };
class OpKeyBuilder {
public:
explicit OpKeyBuilder(const char *op_name);
OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed);
template <typename T>
OpKeyBuilder &TypeConstraint(const char *attr_name);
const std::string Build();
private:
std::string op_name_;
std::map<std::string, DataType> type_constraint_;
};
template <typename T>
OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
}
#define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \ #define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \
namespace { \ namespace { \
static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \ static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \
......
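Note: a hypothetical call site illustrating the key produced by the OpKeyBuilder added above (a sketch, not code from this commit):

// Sketch only: registration and lookup must agree on one key per (op, T) pair.
std::string key = mace::OpKeyBuilder("Conv2D")
                      .TypeConstraint<float>("T")
                      .Build();
// Given Build() above, key == "Conv2DT_DT_FLOAT"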
...@@ -106,10 +106,10 @@ class Registerer { ...@@ -106,10 +106,10 @@ class Registerer {
} }
#define MACE_REGISTER_CREATOR(RegistryName, key, ...) \ #define MACE_REGISTER_CREATOR(RegistryName, key, ...) \
MACE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) MACE_REGISTER_TYPED_CREATOR(RegistryName, key, __VA_ARGS__)
#define MACE_REGISTER_CLASS(RegistryName, key, ...) \ #define MACE_REGISTER_CLASS(RegistryName, key, ...) \
MACE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) MACE_REGISTER_TYPED_CLASS(RegistryName, key, __VA_ARGS__)
} // namespace mace } // namespace mace
......
...@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() { ...@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() {
return; return;
} }
cl_command_queue_properties properties = 0;
#ifdef __ENABLE_PROFILING
enable_profiling_ = true;
profiling_ev_.reset(new cl::Event());
properties = CL_QUEUE_PROFILING_ENABLE;
#endif
// a context is like a "runtime link" to the device and platform; // a context is like a "runtime link" to the device and platform;
// i.e. communication is possible // i.e. communication is possible
cl::Context context({gpu_device}); cl::Context context({gpu_device});
cl_command_queue_properties properties = 0;
if (enable_profiling_) {
profiling_ev_.reset(new cl::Event());
properties = CL_QUEUE_PROFILING_ENABLE;
}
cl::CommandQueue command_queue(context, gpu_device, properties); cl::CommandQueue command_queue(context, gpu_device, properties);
instance = new OpenCLRuntime(context, gpu_device, command_queue); instance = new OpenCLRuntime(context, gpu_device, command_queue);
...@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() { ...@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() {
} }
cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() { cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first."); MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>(); return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
} }
cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() { cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first."); MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>(); return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
} }
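Note: a minimal usage sketch (not from this commit) of the profiling path above, assuming the build defined __ENABLE_PROFILING; kernel, gws and lws are placeholders:

auto runtime = OpenCLRuntime::Get();
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
    kernel, cl::NullRange, cl::NDRange(gws), cl::NDRange(lws),
    nullptr, runtime->GetDefaultEvent());
runtime->command_queue().finish();  // wait so the event timestamps are valid
cl_ulong elapsed_ns =
    runtime->GetEventProfilingEndInfo() - runtime->GetEventProfilingStartInfo();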
...@@ -139,6 +141,7 @@ const std::map<std::string, std::string> ...@@ -139,6 +141,7 @@ const std::map<std::string, std::string>
OpenCLRuntime::program_map_ = { OpenCLRuntime::program_map_ = {
{"addn", "addn.cl"}, {"addn", "addn.cl"},
{"batch_norm", "batch_norm.cl"}, {"batch_norm", "batch_norm.cl"},
{"conv_2d", "conv_2d.cl"},
{"conv_2d_1x1", "conv_2d_1x1.cl"}, {"conv_2d_1x1", "conv_2d_1x1.cl"},
{"conv_2d_3x3", "conv_2d_3x3.cl"}, {"conv_2d_3x3", "conv_2d_3x3.cl"},
{"depthwise_conv_3x3", "depthwise_conv_3x3.cl"}, {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
......
...@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) { ...@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) {
} }
} }
std::string DataTypeToString(const DataType dt) {
static std::map<DataType, std::string> dtype_string_map = {
{DT_FLOAT, "DT_FLOAT"},
{DT_HALF, "DT_HALF"},
{DT_DOUBLE, "DT_DOUBLE"},
{DT_UINT8, "DT_UINT8"},
{DT_INT8, "DT_INT8"},
{DT_INT32, "DT_INT32"},
{DT_UINT32, "DT_UINT32"},
{DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"},
{DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}
};
MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
return dtype_string_map[dt];
}
size_t GetEnumTypeSize(const DataType dt) { size_t GetEnumTypeSize(const DataType dt) {
switch (dt) { switch (dt) {
......
...@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt); ...@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt);
size_t GetEnumTypeSize(const DataType dt); size_t GetEnumTypeSize(const DataType dt);
std::string DataTypeToString(const DataType dt);
template <class T> template <class T>
struct IsValidDataType; struct IsValidDataType;
......
...@@ -24,7 +24,7 @@ cc_library( ...@@ -24,7 +24,7 @@ cc_library(
"*.h", "*.h",
"hexagon/*.h", "hexagon/*.h",
]), ]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
deps = [ deps = [
"//mace/proto:cc_proto", "//mace/proto:cc_proto",
"//mace/core:core", "//mace/core:core",
...@@ -36,7 +36,7 @@ cc_test( ...@@ -36,7 +36,7 @@ cc_test(
name = "dsp_test", name = "dsp_test",
testonly = 1, testonly = 1,
srcs = glob(["*_test.cc"]), srcs = glob(["*_test.cc"]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([ linkopts = if_android([
"-ldl", "-ldl",
"-lm", "-lm",
...@@ -52,7 +52,7 @@ cc_test( ...@@ -52,7 +52,7 @@ cc_test(
name = "dsp_op_test", name = "dsp_op_test",
testonly = 1, testonly = 1,
srcs = glob(["test/*_test.cc"]), srcs = glob(["test/*_test.cc"]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([ linkopts = if_android([
"-ldl", "-ldl",
"-lm", "-lm",
...@@ -64,3 +64,21 @@ cc_test( ...@@ -64,3 +64,21 @@ cc_test(
"//mace/kernels:kernels", "//mace/kernels:kernels",
], ],
) )
cc_binary(
name = "mace_dsp_run",
srcs = [
"tool/mace_dsp_run.cc",
],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([
"-ldl",
"-lm",
]),
linkstatic = 1,
deps = [
":dsp",
"//mace/kernels:kernels",
"//mace/utils:command_line_flags",
],
)
\ No newline at end of file
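Note: the new mace_dsp_run target is a standalone on-device tool; per the usage comment in its source later in this diff, it is invoked roughly as mace_dsp_run --model=<pb file> --input_shape=1,3,224,224 --input_file=<raw floats> --output_file=<out> --round=<N>.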
...@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) { ...@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) {
} }
// input info // input info
const InputInfo& input_info = net_def.input_info()[0]; num_inputs_ = 0;
input_shape_.insert(input_shape_.begin(), for (const InputInfo &input_info: net_def.input_info()) {
input_info.dims().begin(), input_info.dims().end()); vector<index_t> input_shape;
while (input_shape_.size() < 4) { input_shape.insert(input_shape.begin(),
input_shape_.insert(input_shape_.begin(), 1); input_info.dims().begin(), input_info.dims().end());
while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1);
}
input_shapes_.push_back(input_shape);
input_data_types_.push_back(input_info.data_type());
num_inputs_ += 1;
} }
input_data_type_ = input_info.data_type();
// output info // output info
const OutputInfo& output_info = net_def.output_info()[0]; num_outputs_ = 0;
output_shape_.insert(output_shape_.begin(), for (const OutputInfo &output_info: net_def.output_info()) {
output_info.dims().begin(), output_info.dims().end()); vector<index_t> output_shape;
while (output_shape_.size() < 4) { output_shape.insert(output_shape.begin(),
output_shape_.insert(output_shape_.begin(), 1); output_info.dims().begin(), output_info.dims().end());
while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1);
}
output_shapes_.push_back(output_shape);
output_data_types_.push_back(output_info.data_type());
num_outputs_ += 1;
} }
output_data_type_ = output_info.data_type();
bool res = hexagon_nn_prepare(nn_id_) == 0; bool res = hexagon_nn_prepare(nn_id_) == 0;
return res; return res;
...@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() { ...@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME); hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
} }
bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
Tensor *output_tensor) {
LOG(INFO) << "Execute graph: " << nn_id_;
// single input and single output
MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
output_tensor->SetDtype(output_data_types_[0]);
output_tensor->Resize(output_shapes_[0]);
vector<uint32_t> output_shape(4);
uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_,
input_tensor.shape()[0],
input_tensor.shape()[1],
input_tensor.shape()[2],
input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(
input_tensor.raw_data()),
input_tensor.raw_size(),
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
MACE_ASSERT(output_shape == output_shapes_[0],
"wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred.");
return res == 0;
};
bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
vector<Tensor> *output_tensors) {
LOG(INFO) << "Execute graph new: " << nn_id_;
int num_inputs = input_tensors.size();
int num_outputs = output_tensors->size();
MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
for (int i = 0; i < num_inputs; ++i) {
vector<index_t> input_shape = input_tensors[i].shape();
inputs[i].batches = input_shape[0];
inputs[i].height = input_shape[1];
inputs[i].width = input_shape[2];
inputs[i].depth = input_shape[3];
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
inputs[i].dataLen = input_tensors[i].raw_size();
inputs[i].data_valid_len = input_tensors[i].raw_size();
inputs[i].unused = 0;
}
for (int i = 0; i < num_outputs; ++i) {
(*output_tensors)[i].SetDtype(output_data_types_[i]);
(*output_tensors)[i].Resize(output_shapes_[i]);
outputs[i].data = reinterpret_cast<unsigned char *>(
(*output_tensors)[i].raw_mutable_data());
outputs[i].dataLen = (*output_tensors)[i].raw_size();
}
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
outputs, num_outputs);
for (int i = 0; i < num_outputs; ++i) {
vector<uint32_t> output_shape {outputs[i].batches, outputs[i].height,
outputs[i].width, outputs[i].depth};
MACE_ASSERT(output_shape == output_shapes_[i],
"wrong output shape inferred");
MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
"wrong output bytes inferred.");
}
delete [] inputs;
delete [] outputs;
return res == 0;
};
bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
Tensor *output_tensor) {
vector<Tensor> input_tensors(3);
vector<Tensor> output_tensors(3);
input_tensors[0].SetDtype(DT_UINT8);
output_tensors[0].SetDtype(DT_UINT8);
input_tensors[0].ResizeLike(input_tensor);
input_tensors[1].Resize({1, 1, 1, 1});
float *min_in_data = input_tensors[1].mutable_data<float>();
input_tensors[2].Resize({1, 1, 1, 1});
float *max_in_data = input_tensors[2].mutable_data<float>();
quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data);
if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
return false;
}
output_tensor->ResizeLike(output_tensors[0]);
const float *min_out_data = output_tensors[1].data<float>();
const float *max_out_data = output_tensors[2].data<float>();
quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor);
return true;
}
} // namespace mace } // namespace mace
\ No newline at end of file
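Note: in ExecuteGraphPreQuantize above, the DSP graph is given three tensors (the quantized uint8 data plus single-element float min and max) and returns three tensors in the same layout; the first output is dequantized with the returned min/max, so callers keep working purely in float.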
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "mace/dsp/hexagon/hexagon_controller.h" #include "mace/dsp/hexagon/hexagon_controller.h"
#include "mace/dsp/hexagon_nn_ops.h" #include "mace/dsp/hexagon_nn_ops.h"
#include "mace/dsp/util/quantize.h"
#include "mace/core/common.h" #include "mace/core/common.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
...@@ -23,35 +24,10 @@ class HexagonControlWrapper { ...@@ -23,35 +24,10 @@ class HexagonControlWrapper {
bool Finalize(); bool Finalize();
bool SetupGraph(const NetDef& net_def); bool SetupGraph(const NetDef& net_def);
bool SetupGraph(const std::string &model_file); bool SetupGraph(const std::string &model_file);
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) { bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
LOG(INFO) << "Execute graph: " << nn_id_; bool ExecuteGraphNew(const vector<Tensor>& input_tensors,
output_tensor->SetDtype(output_data_type_); vector<Tensor> *output_tensors);
output_tensor->Resize(output_shape_); bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
vector<uint32_t> output_shape(4);
uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_,
input_tensor.shape()[0],
input_tensor.shape()[1],
input_tensor.shape()[2],
input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(
input_tensor.raw_data()),
input_tensor.raw_size(),
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
MACE_ASSERT(output_shape == output_shape_,
"wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred.");
return res == 0;
};
bool TeardownGraph(); bool TeardownGraph();
void PrintLog(); void PrintLog();
...@@ -70,11 +46,14 @@ class HexagonControlWrapper { ...@@ -70,11 +46,14 @@ class HexagonControlWrapper {
int nn_id_; int nn_id_;
Serializer serializer_; Serializer serializer_;
Quantizer quantizer_;
vector<index_t> input_shape_;
vector<index_t> output_shape_; vector<vector<index_t>> input_shapes_;
DataType input_data_type_; vector<vector<index_t>> output_shapes_;
DataType output_data_type_; vector<DataType> input_data_types_;
vector<DataType> output_data_types_;
uint32_t num_inputs_;
uint32_t num_outputs_;
DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper); DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
}; };
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
using namespace mace; using namespace mace;
TEST(HexagonControlerWrapper, GetVersion) { TEST(HexagonControlerWrapper, InputFloat) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
HexagonControlWrapper wrapper; HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion(); VLOG(0) << "version: " << wrapper.GetVersion();
...@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) { ...@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) {
wrapper.ResetPerfInfo(); wrapper.ResetPerfInfo();
timeval tv1, tv2; timeval tv1, tv2;
gettimeofday(&tv1, NULL); gettimeofday(&tv1, NULL);
int round = 2; int round = 10;
for (int i = 0; i < round; ++i) { for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
} }
...@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) { ...@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) {
} }
std::cout << std::endl; std::cout << std::endl;
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
}
TEST(HexagonControlerWrapper, PreQuantize) {
testing::internal::LogToStderr();
HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion();
wrapper.Init();
wrapper.SetDebugLevel(0);
wrapper.Config();
VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb");
wrapper.PrintGraph();
Tensor input_tensor;
Tensor output_tensor;
input_tensor.Resize({1, 480, 480, 3});
float *input_data = input_tensor.mutable_data<float>();
for (int i = 0; i < input_tensor.size(); ++i) {
input_data[i] = i % 256;
}
wrapper.ResetPerfInfo();
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
int round = 10;
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
VLOG(0) << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round;
wrapper.GetPerfInfo();
wrapper.PrintLog();
const float *output_data = output_tensor.data<float>();
for (int i = 0; i < output_tensor.size(); ++i) {
std::cout << output_data[i] << " ";
}
std::cout << std::endl;
VLOG(0) << wrapper.TeardownGraph(); VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize(); wrapper.Finalize();
} }
\ No newline at end of file
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "mace/dsp/hexagon_control_wrapper.h" #include "mace/dsp/hexagon_control_wrapper.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#define RESIZE_BILINEAR_TEST_CHANNELS 128
using namespace mace; using namespace mace;
static NetDef BuildNetDef() { static NetDef BuildNetDef() {
...@@ -17,7 +18,7 @@ static NetDef BuildNetDef() { ...@@ -17,7 +18,7 @@ static NetDef BuildNetDef() {
input_op->set_type("INPUT"); input_op->set_type("INPUT");
input_op->set_node_id(0); input_op->set_node_id(0);
input_op->set_padding(0); input_op->set_padding(0);
input_op->add_out_max_byte_size(1000); input_op->add_out_max_byte_size(1200);
// relu op // relu op
OperatorDef *resize_bilinear_op = net.add_op(); OperatorDef *resize_bilinear_op = net.add_op();
...@@ -45,7 +46,7 @@ static NetDef BuildNetDef() { ...@@ -45,7 +46,7 @@ static NetDef BuildNetDef() {
input_node_input = resize_bilinear_op->add_node_input(); input_node_input = resize_bilinear_op->add_node_input();
input_node_input->set_node_id(12); input_node_input->set_node_id(12);
input_node_input->set_output_port(0); input_node_input->set_output_port(0);
resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1200);
resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1000);
resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1000);
...@@ -64,8 +65,8 @@ static NetDef BuildNetDef() { ...@@ -64,8 +65,8 @@ static NetDef BuildNetDef() {
new_dim_tensor->add_dims(2); new_dim_tensor->add_dims(2);
new_dim_tensor->set_data_type(DataType::DT_INT32); new_dim_tensor->set_data_type(DataType::DT_INT32);
new_dim_tensor->set_node_id(10); new_dim_tensor->set_node_id(10);
new_dim_tensor->add_int32_data(1); new_dim_tensor->add_int32_data(2);
new_dim_tensor->add_int32_data(1); new_dim_tensor->add_int32_data(2);
TensorProto *input_min_tensor = net.add_tensors(); TensorProto *input_min_tensor = net.add_tensors();
input_min_tensor->set_name("input_min"); input_min_tensor->set_name("input_min");
...@@ -86,20 +87,20 @@ static NetDef BuildNetDef() { ...@@ -86,20 +87,20 @@ static NetDef BuildNetDef() {
input_info->set_name("input_node"); input_info->set_name("input_node");
input_info->set_node_id(0); input_info->set_node_id(0);
input_info->add_dims(1); input_info->add_dims(1);
input_info->add_dims(2); input_info->add_dims(3);
input_info->add_dims(2); input_info->add_dims(3);
input_info->add_dims(128); input_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
input_info->set_data_type(DataType::DT_UINT8); input_info->set_data_type(DataType::DT_UINT8);
input_info->set_max_byte_size(1000); input_info->set_max_byte_size(1200);
OutputInfo *output_info = net.add_output_info(); OutputInfo *output_info = net.add_output_info();
output_info->set_name("output_node"); output_info->set_name("output_node");
output_info->set_node_id(1); output_info->set_node_id(1);
output_info->add_dims(1); output_info->add_dims(1);
output_info->add_dims(1); output_info->add_dims(2);
output_info->add_dims(1); output_info->add_dims(2);
output_info->add_dims(128); output_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
output_info->set_data_type(DataType::DT_UINT8); output_info->set_data_type(DataType::DT_UINT8);
output_info->set_max_byte_size(1000); output_info->set_max_byte_size(1200);
return net; return net;
} }
...@@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) { ...@@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) {
Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU); Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU);
Tensor input_tensor(cpu_allocator, DT_UINT8); Tensor input_tensor(cpu_allocator, DT_UINT8);
Tensor output_tensor(cpu_allocator, DT_UINT8); Tensor output_tensor(cpu_allocator, DT_UINT8);
input_tensor.Resize({1, 2, 2, 128}); input_tensor.Resize({1, 3, 3, RESIZE_BILINEAR_TEST_CHANNELS});
output_tensor.Resize({1, 1, 1, 128}); output_tensor.Resize({1, 2, 2, RESIZE_BILINEAR_TEST_CHANNELS});
uint8_t *input_data = input_tensor.mutable_data<uint8_t>(); uint8_t *input_data = input_tensor.mutable_data<uint8_t>();
const uint8_t *output_data = output_tensor.data<uint8_t>(); const uint8_t *output_data = output_tensor.data<uint8_t>();
for (int c = 0; c < 128; ++c) { for (int wh = 0; wh < 9; ++wh) {
input_data[c] = input_data[c + 128] = input_data[c + 256] for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) {
= input_data[c + 384] = (uint8_t)c; input_data[wh * RESIZE_BILINEAR_TEST_CHANNELS + c] = 9 - wh;
}
} }
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
wrapper.PrintLog(); wrapper.PrintLog();
for (int i = 0; i < output_tensor.size(); ++i) { vector<uint8_t> expected {9, 8, 5, 3};
EXPECT_EQ(i, output_data[i]); for (int i = 0; i < 4; ++i) {
for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c)
EXPECT_EQ(expected[i],
output_data[i * RESIZE_BILINEAR_TEST_CHANNELS + c]);
} }
std::cout << std::endl; std::cout << std::endl;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
/**
* Usage:
* mace_dsp_run --model=mobi_mace.pb \
* --input_shape=1,3,224,224 \
* --input_file=input_data \
* --output_file=mace.out
*/
#include <sys/time.h>
#include <fstream>
#include "mace/dsp/hexagon_control_wrapper.h"
#include "mace/core/net.h"
#include "mace/utils/command_line_flags.h"
using namespace std;
using namespace mace;
void ParseShape(const string &str, vector<index_t> *shape) {
string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
int main(int argc, char **argv) {
string model_file;
string input_shape;
string input_file;
string output_file;
int round = 1;
std::vector<Flag> flag_list = {
Flag("model", &model_file, "model file name"),
Flag("input_shape", &input_shape, "input shape, separated by comma"),
Flag("input_file", &input_file, "input file name"),
Flag("output_file", &output_file, "output file name"),
Flag("round", &round, "round"),
};
string usage = Flags::Usage(argv[0], flag_list);
const bool parse_result = Flags::Parse(&argc, argv, flag_list);
if (!parse_result) {
LOG(ERROR) << usage;
return -1;
}
VLOG(0) << "model: " << model_file << std::endl
<< "input_shape: " << input_shape << std::endl
<< "input_file: " << input_file << std::endl
<< "output_file: " << output_file << std::endl
<< "round: " << round << std::endl;
vector<index_t> shape;
ParseShape(input_shape, &shape);
// load input
Tensor input_tensor;
input_tensor.Resize(shape);
float *input_data = input_tensor.mutable_data<float>();
ifstream in_file(input_file, ios::in | ios::binary);
in_file.read(reinterpret_cast<char *>(input_data),
input_tensor.size() * sizeof(float));
in_file.close();
// execute
HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion();
wrapper.Init();
wrapper.SetDebugLevel(0);
wrapper.Config();
VLOG(0) << wrapper.SetupGraph(model_file);
wrapper.PrintGraph();
Tensor output_tensor;
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
cout << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round
<< endl;
wrapper.GetPerfInfo();
wrapper.PrintLog();
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
// save output
ofstream out_file(output_file, ios::binary);
out_file.write((const char *) (output_tensor.data<float>()),
output_tensor.size() * sizeof(float));
out_file.flush();
out_file.close();
}
\ No newline at end of file
...@@ -20,7 +20,7 @@ cc_library( ...@@ -20,7 +20,7 @@ cc_library(
hdrs = glob([ hdrs = glob([
"*.h", "*.h",
]), ]),
copts = ["-std=c++11"], copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
deps = [ deps = [
"//mace/core:core", "//mace/core:core",
], ],
......
...@@ -10,15 +10,23 @@ ...@@ -10,15 +10,23 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<DeviceType D, typename T> struct AddNFunctorBase {};
struct AddNFunctor {
void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) { template <DeviceType D, typename T>
struct AddNFunctor : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
output_tensor->ResizeLike(input_tensors[0]);
Tensor::MappingGuard output_map(output_tensor); Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size(); index_t size = input_tensors[0]->size();
T *output_ptr = output_tensor->mutable_data<T>(); T *output_ptr = output_tensor->mutable_data<T>();
memset(output_ptr, 0, size * sizeof(T)); memset(output_ptr, 0, size * sizeof(T));
int n = input_tensors.size(); int n = input_tensors.size();
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
Tensor::MappingGuard input_map(input_tensors[i]); Tensor::MappingGuard input_map(input_tensors[i]);
const T *input_ptr = input_tensors[i]->data<T>(); const T *input_ptr = input_tensors[i]->data<T>();
for (index_t j = 0; j < size; ++j) { for (index_t j = 0; j < size; ++j) {
...@@ -28,15 +36,17 @@ struct AddNFunctor { ...@@ -28,15 +36,17 @@ struct AddNFunctor {
} }
}; };
template<> template <>
void AddNFunctor<DeviceType::NEON, float>::operator()( void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor); const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
template<> template <typename T>
void AddNFunctor<DeviceType::OPENCL, float>::operator()( struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
std::vector<const Tensor *> &inputs, Tensor *output); void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_ADDN_H_ #endif // MACE_KERNELS_ADDN_H_
\ No newline at end of file
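Note: a hypothetical call site for the functor layout above; the same code compiles against the generic template, the NEON specialization, or the new OPENCL partial specialization:

// Sketch only: a, b and output are mace::Tensor instances with identical shapes.
mace::kernels::AddNFunctor<mace::DeviceType::CPU, float> addn;
addn({&a, &b}, &output);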
...@@ -28,9 +28,10 @@ struct BatchNormFunctor { ...@@ -28,9 +28,10 @@ struct BatchNormFunctor {
// new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
// new_offset = \offset - mean * common_val; // new_offset = \offset - mean * common_val;
// Y = new_scale * X + new_offset; // Y = new_scale * X + new_offset;
const index_t n = input->dim(0); const index_t batch = input->dim(0);
const index_t channel = input->dim(1); const index_t height = input->dim(1);
const index_t sample_size = input->dim(2) * input->dim(3); const index_t width = input->dim(2);
const index_t channels = input->dim(3);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard scale_mapper(scale); Tensor::MappingGuard scale_mapper(scale);
...@@ -48,19 +49,26 @@ struct BatchNormFunctor { ...@@ -48,19 +49,26 @@ struct BatchNormFunctor {
const T *epsilon_ptr = epsilon->data<T>(); const T *epsilon_ptr = epsilon->data<T>();
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
vector<T> new_scale(channels);
vector<T> new_offset(channels);
#pragma omp parallel for #pragma omp parallel for
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channels; ++c) {
T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr); new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale; new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
index_t pos = c * sample_size; }
index_t pos = 0;
for (index_t i = 0; i < n; ++i) { #pragma omp parallel for
const T *input_sample_ptr = input_ptr + pos; for (index_t n = 0; n < batch; ++n) {
T *output_sample_ptr = output_ptr + pos; for (index_t h = 0; h < height; ++h) {
for (index_t j = 0; j < sample_size; ++j) { for (index_t w = 0; w < width; ++w) {
output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset; for (index_t c = 0; c < channels; ++c) {
output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
++pos;
}
} }
pos += channel * sample_size;
} }
} }
} }
...@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()( ...@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *epsilon, const Tensor *epsilon,
Tensor *output); Tensor *output);
template <> template <typename T>
void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( struct BatchNormFunctor<DeviceType::OPENCL, T> {
const Tensor *input, void operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
const Tensor *mean, const Tensor *mean,
const Tensor *var, const Tensor *var,
const Tensor *epsilon, const Tensor *epsilon,
Tensor *output); Tensor *output);
};
} // namepsace kernels } // namepsace kernels
} // namespace mace } // namespace mace
......
...@@ -11,13 +11,23 @@ ...@@ -11,13 +11,23 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct Conv2dFunctorBase {
Conv2dFunctorBase(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct Conv2dFunctor { struct Conv2dFunctor : Conv2dFunctorBase {
Conv2dFunctor() {}
Conv2dFunctor(const int *strides, Conv2dFunctor(const int *strides,
const Padding &paddings, const Padding &paddings,
const int *dilations) const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {} : Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -76,9 +86,10 @@ struct Conv2dFunctor { ...@@ -76,9 +86,10 @@ struct Conv2dFunctor {
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
T bias_channel = bias_data ? bias_data[c] : 0; T bias_channel = 0.0f;
if (bias) bias_channel = bias_data[c];
*output_data = bias_channel; *output_data = bias_channel;
T sum = 0; T sum = 0.0f;
const T *filter_ptr = filter_data + c; const T *filter_ptr = filter_data + c;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
...@@ -113,9 +124,6 @@ struct Conv2dFunctor { ...@@ -113,9 +124,6 @@ struct Conv2dFunctor {
} }
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
}; };
template<> template<>
...@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output); Tensor *output);
template<>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, template<typename T>
const Tensor *filter, struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const Tensor *bias, Conv2dFunctor(const int *strides,
Tensor *output); const Padding &paddings,
const int *dilations)
: Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_FUSED_CONV_2D_H_
#define MACE_KERNELS_FUSED_CONV_2D_H_
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/conv_2d.h"
namespace mace {
namespace kernels {
struct FusedConv2dFunctorBase {
FusedConv2dFunctorBase(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template<DeviceType D, typename T>
struct FusedConv2dFunctor : FusedConv2dFunctorBase {
FusedConv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: FusedConv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
T *output_data = output->mutable_data<T>();
T zero_value;
if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
zero_value = half_float::half_cast<half>(0.0f);
} else {
zero_value = 0;
}
auto output_size = output->size();
for (int n = 0; n < output_size; ++n) {
*output_data = *output_data < 0 ? zero_value : *output_data;
output_data++;
}
}
};
template<typename T>
struct FusedConv2dFunctor<DeviceType::OPENCL, T> : FusedConv2dFunctorBase {
FusedConv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: FusedConv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_FUSED_CONV_2D_H_
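Note: FusedConv2dFunctor layers a ReLU onto the existing Conv2dFunctor: the generic path runs the ordinary convolution and then clamps negative outputs to zero in place (using a half-typed zero when T is half), while the OPENCL partial specialization only declares operator() and defers to a fused kernel implemented elsewhere.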
...@@ -10,7 +10,7 @@ namespace kernels { ...@@ -10,7 +10,7 @@ namespace kernels {
template <> template <>
void AddNFunctor<DeviceType::NEON, float>::operator()( void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) { const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
// TODO: neon mem copy // TODO: neon mem copy
index_t size = output_tensor->size(); index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>(); float *output_ptr = output_tensor->mutable_data<float>();
...@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()( ...@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
}; };
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
\ No newline at end of file
...@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
filter_shape[0] = input_tensor->shape()[1];
filter_shape[1] = input_tensor->shape()[1];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, output_shape.data(),
paddings.data());
output_tensor->Resize(output_shape);
const float *input = input_tensor->data<float>(); const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>(); float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
Tensor padded_input; Tensor padded_input;
ConstructInputWithPadding(input_tensor, paddings, &padded_input); ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input);
input = padded_input.data<float>(); input = padded_input.data<float>();
input_shape = padded_input.shape().data(); input_shape = padded_input.shape().data();
#endif #endif
...@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 2x2, strides: 2x2 // kernel_size: 2x2, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2 if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape); PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} else { // AVG_POOL_2x2s2x2 } else { // AVG_POOL_2x2s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape); PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape, PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} }
} else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 && } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
...@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()( ...@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
// kernel_size: 3x3, strides: 2x2 // kernel_size: 3x3, strides: 2x2
if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2 if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape); PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} else { // AVG_POOL_3x3s2x2 } else { // AVG_POOL_3x3s2x2
#ifdef __COPY_MAKE_PADDING #ifdef __COPY_MAKE_PADDING
PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape); PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
#else #else
PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape, PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
paddings); paddings.data());
#endif #endif
} }
} else { // not implement yet } else { // not implement yet
......
...@@ -5,52 +5,83 @@ ...@@ -5,52 +5,83 @@
#include "mace/kernels/addn.h" #include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) { template <typename T>
index_t element_size = input0->NumElements(); static void AddN(const std::vector<const Tensor *> &input_tensors,
index_t blocks = (element_size + 3) / 4; Tensor *output) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const uint32_t gws = blocks; const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype())); auto dt = DataTypeToEnum<T>::value;
auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel); const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
uint32_t idx = 0; uint32_t idx = 0;
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer()))); for (auto input : input_tensors) {
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer()))); addn_kernel.setArg(idx++,
addn_kernel.setArg(idx++, static_cast<int32_t>(element_size)); *(static_cast<const cl::Image2D *>(input->buffer())));
addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); }
addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange, addn_kernel, cl::NullRange,
cl::NDRange(gws), cl::NDRange(width_pixels, batch_height_pixels),
cl::NDRange(lws), cl::NDRange(64, 16), // TODO fix this
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
} }
template<> template <typename T>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors, void AddNFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *output_tensor) { const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
if (input_tensors.empty() || input_tensors.front() == nullptr) {
return;
}
size_t size = input_tensors.size(); size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
switch (size) { for (int i = 1; i < size; ++i) {
case 2:Add2(input_tensors[0], input_tensors[1], output_tensor); MACE_CHECK_NOTNULL(input_tensors[i]);
break; MACE_CHECK(batch == input_tensors[i]->dim(0));
default:MACE_NOT_IMPLEMENTED; MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
} }
std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor);
}; };
template
struct AddNFunctor<DeviceType::OPENCL, float>;
template
struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
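Note: for scale, with a 1x64x64x32 output, channel_blocks = RoundUpDiv4(32) = 8, so the enqueued range is (8*64, 1*64) = (512, 64); the hard-coded local size (64, 16) happens to divide that evenly here, but as the TODO says it still needs proper tuning.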
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <> template <typename T>
void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
...@@ -21,35 +21,39 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -21,35 +21,39 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *epsilon, const Tensor *epsilon,
Tensor *output) { Tensor *output) {
index_t pixel_size = input->dim(2) * input->dim(3); const index_t batch = input->dim(0);
index_t blocks = (pixel_size + 3) / 4; const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const uint32_t gws[3] = {static_cast<uint32_t>(input->dim(0)), const index_t channel_blocks = RoundUpDiv4(channels);
static_cast<uint32_t>(input->dim(1)),
static_cast<uint32_t>(blocks)}; const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
const std::vector<uint32_t> lws = {1, 1, kwg_size}; const std::vector<uint32_t> lws = {1, kwg_size, 1};
uint32_t idx = 0; uint32_t idx = 0;
bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(scale->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(scale->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(offset->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(offset->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(mean->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(mean->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(var->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(var->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(epsilon->buffer()))); bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(epsilon->buffer())));
bm_kernel.setArg(idx++, static_cast<uint32_t>(pixel_size)); bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> { auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
return {{1, 1, 64}, return {{8, 128, 1}, //SNPE size
{1, 1, 64},
{1, 1, 128}, {1, 1, 128},
{1, kwg_size/16, 16}, {1, kwg_size/16, 16},
{1, kwg_size/32, 32}, {1, kwg_size/32, 32},
...@@ -80,5 +84,9 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -80,5 +84,9 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
func); func);
} }
template
struct BatchNormFunctor<DeviceType::OPENCL, float>;
template
struct BatchNormFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
} }
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
string kernel_name; string kernel_name;
switch (type) { switch (type) {
......
#include <common.h> #include <common.h>
// Supported data type: half/float __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__kernel void add2(__global const DATA_TYPE *input0, __read_only image2d_t input1,
__global const DATA_TYPE *input1, #if INPUT_NUM > 2
__private const int size, __read_only image2d_t input2,
__global DATA_TYPE *output) { #endif
int idx = get_global_id(0); #if INPUT_NUM > 3
__read_only image2d_t input3,
#endif
__write_only image2d_t output) {
const int w = get_global_id(0);
const int hb = get_global_id(1);
if (idx + 4 > size) { const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
for(; idx < size; ++idx) {
*(output+idx) = *(input0+idx) + *(input1+idx); DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
} DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
} else { DATA_TYPE4 out = in0 + in1;
VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1); #if INPUT_NUM > 2
vstore4(in_data0+in_data1, idx, output); DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
} out = out + in2;
#endif
#if INPUT_NUM > 3
DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
out = out + in3;
#endif
WRITE_IMAGET(output, (int2)(w, hb), out);
} }
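Note: the INPUT_NUM define emitted by the host code above compiles in only the image reads that are actually needed, which is why the host side currently rejects more than four inputs with MACE_NOT_IMPLEMENTED.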
#include <common.h>

// Supported data types: half/float
__kernel void batch_norm(__read_only image2d_t input,
                         __read_only image2d_t scale,
                         __read_only image2d_t offset,
                         __read_only image2d_t mean,
                         __read_only image2d_t var,
                         __global const DATA_TYPE *epsilon,
                         __write_only image2d_t output) {
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
  const int width = get_global_size(1);

  DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon));
  DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;

  const int pos = ch_blk * width + w;

  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
  DATA_TYPE4 out = in * new_scale + new_offset;
  WRITE_IMAGET(output, (int2)(pos, hb), out);
}
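// The kernel folds the batch-norm statistics into a single multiply-add,
//   out = new_scale * in + new_offset,
// with new_scale = scale / sqrt(var + epsilon) and
//      new_offset = offset - mean * new_scale,
// which is algebraically the standard inference-time batch normalization.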
...@@ -14,4 +14,11 @@
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
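// For reference, with the build options generated by the host helpers used in
// this change (presumably -DDATA_TYPE=float -DCMD_DATA_TYPE=f for float, and
// -DDATA_TYPE=half -DCMD_DATA_TYPE=h for half), the macros above expand to:
//   DATA_TYPE4   -> float4           / half4
//   READ_IMAGET  -> read_imagef(...) / read_imageh(...)
//   WRITE_IMAGET -> write_imagef(...)/ write_imageh(...)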
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
#include <common.h>
__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
__private const int out_height,
__private const int out_width,
__private const int filter_height,
__private const int filter_width,
__private const int padding_top,
__private const int padding_left) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
int in_width3 = in_width2 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif
const int batch_idx = (out_hb / out_height) * in_height;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, in_width_idx;
  // Unrolling this loop hurt performance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
READ_INPUT(0);
READ_INPUT(1);
READ_INPUT(2);
READ_INPUT(3);
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
        // Will prefetch L2 improve performance? How to prefetch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
}
}
}
#ifdef FUSED_RELU
// TODO relux
out0 = fmax(out0, 0);
out1 = fmax(out1, 0);
out2 = fmax(out2, 0);
out3 = fmax(out3, 0);
#endif
const int out_x_base = out_ch_blk * out_width;
int w = out_w_blk;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out0);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out1);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out2);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out3);
}
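// Each conv_2d work-item accumulates four output pixels (out0..out3) of one
// output-channel block, so the launch grid is roughly
// (out_channels/4, ceil(out_width/4), out_height * batch), matching the
// channel_blocks/width_blocks values computed by the host code added later
// in this change.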
#include <common.h>
#define vec_conv_2d_1x1_s1 \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
#define vec_conv_2d_1x1_s2 \
VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
#define vec_conv_2d_1x1_compute_loop \
for (int oc = 0; oc < 4; ++oc) { \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr + oc * out_pixel); \
}
#define vec_conv_2d_1x1_compute \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr);
// Supported data type: half/float
__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
__global const DATA_TYPE *filter, /* o, i, kh, kw */
#ifdef BIAS
__global const DATA_TYPE *bias, /* o */
#endif /* defined(BIAS) */
__global DATA_TYPE *output, /* n, c, h, w */
__private const int in_chan_num,
__private const int out_chan_num,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
const int stride = 1;
#else
const int stride = 2;
#endif
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
int out_chan_len = out_chan_end - out_chan_begin;
int pixel_len = out_pixel_end - out_pixel_begin;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef BIAS
DATA_TYPE bias_value = bias[out_chan];
#else
DATA_TYPE bias_value = 0;
#endif
for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = bias_value;
}
}
int in_chan = 0;
if (pixel_len == 4) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
int out_chan = out_chan_begin;
for (; out_chan + 3 < out_chan_end; out_chan += 4) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute_loop;
}
for (; out_chan < out_chan_end; ++out_chan) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute;
}
}
}
for (; in_chan < in_chan_num; ++in_chan) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
for (int p = 0; p < pixel_len; ++p) {
float in = input_ptr[p*stride];
output_ptr[p] += in * weights;
}
}
}
}
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int height,
                          __private const int width) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
...@@ -154,151 +18,103 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#ifdef BIAS
  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
#endif

  int4 w;
#if STRIDE == 1
  w.x = out_w_blk;
  w.y = w.x + out_w_blks;
  w.z = w.y + out_w_blks;
  w.w = w.z + out_w_blks;
  int out_hb_idx = (out_hb % height);
#else
  w.x = out_w_blk * 2;
  w.y = (out_w_blk + out_w_blks) * 2;
  w.z = (out_w_blk + 2 * out_w_blks) * 2;
  w.w = (out_w_blk + 3 * out_w_blks) * 2;
  int out_hb_idx = (out_hb % height) * 2;
#endif

  w.x = select(w.x, INT_MIN, w.x >= in_width);
  w.y = select(w.y, INT_MIN, w.y >= in_width);
  w.z = select(w.z, INT_MIN, w.z >= in_width);
  w.w = select(w.w, INT_MIN, w.w >= in_width);

  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
                      -1,
                      out_hb_idx >= in_height);

  // Unrolling this loop hurt performance
  int in_x_base = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));

    const int filter_x0 = in_ch_blk << 2;
    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));

    // Will prefetch L2 improve performance? How to prefetch image data?
    // Interleaving load and mul does not improve performance as expected
    out0 += in0.x * weights0;
    out0 += in0.y * weights1;
    out0 += in0.z * weights2;
    out0 += in0.w * weights3;

    out1 += in1.x * weights0;
    out1 += in1.y * weights1;
    out1 += in1.z * weights2;
    out1 += in1.w * weights3;

    out2 += in2.x * weights0;
    out2 += in2.y * weights1;
    out2 += in2.z * weights2;
    out2 += in2.w * weights3;

    out3 += in3.x * weights0;
    out3 += in3.y * weights1;
    out3 += in3.z * weights2;
    out3 += in3.w * weights3;

    in_x_base += in_width;
  }

#ifdef FUSED_RELU
  // TODO relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
#endif

  const int out_x_base = out_ch_blk * width;
  int out_x_idx = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
}
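// Border handling: out-of-range column/row coordinates are replaced with
// INT_MIN / -1 via select(); with CLK_ADDRESS_CLAMP the corresponding image
// reads then come back as zero for the RGBA images used here, so the padded
// border contributes nothing to the accumulation.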
...@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int out_height,
                          __private const int out_width,
                          __private const int padding_top,
...@@ -17,120 +17,145 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);
  const int rounded_in_ch = in_ch_blks * 4;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#ifdef BIAS
  DATA_TYPE4 out0 =
      READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
  DATA_TYPE4 out4 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
  DATA_TYPE4 out4 = 0;
#endif

#if STRIDE == 1
  int in_width0 = out_w_blk - padding_left;
  int in_width1 = in_width0 + out_w_blks;
  int in_width2 = in_width1 + out_w_blks;
  int in_width3 = in_width2 + out_w_blks;
  int in_width4 = in_width3 + out_w_blks;
  const int height_idx = (out_hb % out_height) - padding_top;
#else
  int in_width0 = out_w_blk * 2 - padding_left;
  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
  const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif

  const int batch_idx = (out_hb / out_height) * in_height;
  const int input_image_width = in_ch_blks * in_width;

  DATA_TYPE4 in0, in1, in2, in3, in4;
  DATA_TYPE4 weights0, weights1, weights2, weights3;
  int in_idx, hb_idx, width_idx, in_width_idx;
  // Unrolling this loop hurt performance
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
      int in_hb_value = height_idx + hb_idx;
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));
      for (short width_idx = 0; width_idx < 3; ++width_idx) {
        in_idx = in_ch_blk * in_width;
        int in_width_value;
#define READ_INPUT(i) \
        in_width_value = in_width##i + width_idx; \
        in_width_value = select(in_idx + in_width_value, \
                                -1, \
                                (in_width_value < 0 || in_width_value >= in_width)); \
        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));

        READ_INPUT(0);
        READ_INPUT(1);
        READ_INPUT(2);
        READ_INPUT(3);
        READ_INPUT(4);
#undef READ_INPUT

        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));

        // Will prefetch L2 improve performance? How to prefetch image data?
        // Interleaving load and mul does not improve performance as expected
        out0 += in0.x * weights0;
        out0 += in0.y * weights1;
        out0 += in0.z * weights2;
        out0 += in0.w * weights3;

        out1 += in1.x * weights0;
        out1 += in1.y * weights1;
        out1 += in1.z * weights2;
        out1 += in1.w * weights3;

        out2 += in2.x * weights0;
        out2 += in2.y * weights1;
        out2 += in2.z * weights2;
        out2 += in2.w * weights3;

        out3 += in3.x * weights0;
        out3 += in3.y * weights1;
        out3 += in3.z * weights2;
        out3 += in3.w * weights3;

        out4 += in4.x * weights0;
        out4 += in4.y * weights1;
        out4 += in4.z * weights2;
        out4 += in4.w * weights3;
      }
    }
  }

#ifdef FUSED_RELU
  // TODO relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
  out4 = fmax(out4, 0);
#endif

  const int out_x_base = out_ch_blk * out_width;
  int w = out_w_blk;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out0);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out1);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out2);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out3);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out4);
}
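// Unlike the generic conv_2d kernel above, the 3x3 variant produces five
// output pixels per work-item (out0..out4), which is why the host side below
// uses RoundUpDiv<index_t, 5>(width) for its width_blocks dimension.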
#include <common.h>

#ifdef FP16
#define MIN_VALUE -USHRT_MAX
#else
#define MIN_VALUE -FLT_MAX
#endif

VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
  VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
DATA_TYPE row01 = *(input_ptr + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
DATA_TYPE row11 = *(input_ptr + in_width + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
return res;
}
// Supported data type: half/float
__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__global DATA_TYPE *output) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
VEC_DATA_TYPE(DATA_TYPE,4) res;
#ifdef STRIDE_1
res = vec_pooling_3_s1(input_ptr, in_width);
#else
      res = vec_pooling_3_s2(input_ptr, in_width);
#endif
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = inner_pooling_3(input_ptr, in_width);
input_ptr += stride;
}
}
}
}
inline int calculate_avg_block_size(const int pool_size,
                                    const int pos_h,
                                    const int pos_w,
                                    const int h_size,
                                    const int w_size) {
  const int h_start = max(0, pos_h);
  const int w_start = max(0, pos_w);
  const int h_end = min(pos_h + pool_size, h_size);
  const int w_end = min(pos_w + pool_size, w_size);
  return (h_end - h_start) * (w_end - w_start);
}

// Supported data type: half/float
__kernel void pooling(__read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,
                      __private const int out_height,
                      __private const int pad_top,
                      __private const int pad_left,
                      __private const int stride,
                      __private const int pooling_size,
                      __write_only image2d_t output) {
  const int out_chan_idx = get_global_id(0);
  const int out_width_idx = get_global_id(1);
  const int out_width = get_global_size(1);
  const int out_hb_idx = get_global_id(2);

  const int batch_idx = (out_hb_idx / out_height) * in_height;
  const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
  const int in_width_start = out_width_idx * stride - pad_left;
  const int in_channel_offset = out_chan_idx * in_width;

#ifdef POOL_AVG
  DATA_TYPE4 res = 0;
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    for (int width = 0; width < pooling_size; ++width) {
      int in_width_idx = in_width_start + width;
      in_width_idx = select(in_channel_offset + in_width_idx,
                            -1,
                            (in_width_idx < 0 || in_width_idx >= in_width));

      DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
      res = res + in;
    }
  }
  const int block_size = calculate_avg_block_size(pooling_size,
                                                  in_height_start, in_width_start,
                                                  in_height, in_width);
  res /= block_size;
#else
  DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    if (in_height_idx != -1) {
      for (int width = 0; width < pooling_size; ++width) {
        int in_width_idx = in_width_start + width;
        in_width_idx = select(in_channel_offset + in_width_idx,
                              -1,
                              (in_width_idx < 0 || in_width_idx >= in_width));
        if (in_width_idx != -1) {
          DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
          res = fmax(res, in);
        }
      }
    }
  }
#endif

  WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
}
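// POOL_AVG selects average pooling at compile time; without it the kernel
// does max pooling seeded with MIN_VALUE, which the FP16 build maps to
// -USHRT_MAX instead of -FLT_MAX. The corresponding -DPOOL_AVG / -DFP16
// build options are assumed to be emitted by the pooling host code, which is
// not part of this diff.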
#include <common.h>

__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
                                      __private const int in_width,
                                      __private const int out_height) {
  const int ch_blk = get_global_id(0);
  const int ch_blks = get_global_size(0);
  const int w = get_global_id(1);
  const int out_width = get_global_size(1);
  const int hb = get_global_id(2);
  const int b = hb / out_height;
  const int h = hb % out_height;

  const float h_in = h * height_scale;
  const float w_in = w * width_scale;
...@@ -24,16 +25,26 @@ __kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c,
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int in_w_offset = ch_blk * in_width;
  const int in_h_offset = b * in_height;

  DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
                                    (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
  DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
                                     (int2)(in_w_offset + w_upper, in_h_offset + h_lower));
  DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
                                       (int2)(in_w_offset + w_lower, in_h_offset + h_upper));
  DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
                                        (int2)(in_w_offset + w_upper, in_h_offset + h_upper));

  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
  DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;

  DATA_TYPE4 out = top + (bottom - top) * h_lerp;

  const int out_w_offset = ch_blk * out_width;
  const int out_h_offset = b * out_height;
  WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
}
...@@ -9,50 +9,56 @@ namespace mace {
namespace kernels {

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
                         const DataType dt, Tensor *output);

template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                      const Tensor *filter,
                                                      const Tensor *bias,
                                                      Tensor *output) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, const bool fused_relu,
                                       const int *padding, const DataType dt,
                                       Tensor *output);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
      {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
      {nullptr, nullptr},
      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  index_t kernel_h = filter->dim(0);
  index_t kernel_w = filter->dim(1);
  if (!input->is_image() || strides_[0] != strides_[1] ||
      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    MACE_NOT_IMPLEMENTED;
  }

  std::vector<index_t> output_shape(4);
...@@ -61,17 +67,24 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
      input->shape().data(), filter->shape().data(), dilations_,
      strides_, paddings_, output_shape.data(), paddings.data());

  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  if (kernel_h == kernel_w && kernel_h <= 5 &&
      selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
    conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum<T>::value, output);
  } else {
    Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
  }
}

template
struct Conv2dFunctor<DeviceType::OPENCL, float>;

template
struct Conv2dFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
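// A rough host-side usage sketch (the Conv2D op wrapper that constructs this
// functor is not part of this diff; the constructor arguments below simply
// mirror the strides_/paddings_/dilations_ members used above):
//
//   kernels::Conv2dFunctor<DeviceType::OPENCL, half> conv2d(strides, paddings, dilations);
//   conv2d(input, filter, bias, output);  // dispatches to K1x1/K3x3 or the generic Conv2dOpencl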
...@@ -5,83 +5,44 @@ ...@@ -5,83 +5,44 @@
#include "mace/kernels/conv_2d.h" #include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
void Conv1x1V2(const Tensor *input, void Conv1x1(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const bool fused_relu,
Tensor *output) { const int stride,
const DataType dt,
Tensor *output) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t channels = output->dim(1); const index_t height = output->dim(1);
const index_t height = output->dim(2); const index_t width = output->dim(2);
const index_t width = output->dim(3); const index_t channels = output->dim(3);
const index_t input_channels = input->dim(1); const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
auto runtime = OpenCLRuntime::Get(); const index_t input_width = input->dim(2);
auto program = runtime->program(); const index_t input_channels = input->dim(3);
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
// TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
// TODO check wired clReleaseCommandQueue latency
// The KernelFunctor can cause segment faults in cb_retain_event
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(input->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)),
cl::NDRange(1, 2, kwg_size / 2),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
void Conv1x1V3(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(input_batch == batch);
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DSTRIDE_1"); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace("-DSTRIDE=" + ToString(stride));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
...@@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input, ...@@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
} }
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks)); conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width)); conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue(); auto command_queue = runtime->command_queue();
cl_int error; cl_int error;
error = command_queue.enqueueNDRangeKernel( error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(height), cl::NDRange(static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)), static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8), cl::NDRange(4, 15, 8), // TODO auto tuning
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
extern void Conv2dOpenclK1x1S1(const Tensor *input, extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const bool fused_relu,
const int *padding, const int *padding,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t batch = output->dim(0); Conv1x1(input, filter, bias, fused_relu, 1, dt, output);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(2);
const index_t input_width = input->dim(3);
MACE_CHECK(input_batch == batch && input_height == height &&
input_width == width);
Conv1x1V2(input, filter, bias, 1, output);
}; };
extern void Conv2dOpenclK1x1S2(const Tensor *input, extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const bool fused_relu,
const int *padding, const int *padding,
const DataType dt,
Tensor *output) { Tensor *output) {
MACE_CHECK(input->dim(0) == output->dim(0)); Conv1x1(input, filter, bias, fused_relu, 2, dt, output);
Conv1x1V2(input, filter, bias, 2, output);
}; };
} // namespace kernels } // namespace kernels
......
...@@ -12,8 +12,9 @@ namespace mace { ...@@ -12,8 +12,9 @@ namespace mace {
namespace kernels { namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const uint32_t stride, const Tensor *bias, const bool fused_relu,
const int *padding, Tensor *output) { const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0; uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
...@@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1))); conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2))); conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3))); conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height)); conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width)); conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[0] / 2);
...@@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel, cl::NullRange, conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks), cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)), static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8), cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *bias, const int *padding, Tensor *output) { const Tensor *filter,
Conv2d3x3S12(input, filter, bias, 1, padding, output); const Tensor *bias,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output);
}; };
void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, void Conv2dOpenclK3x3S2(const Tensor *input,
const Tensor *bias, const int *padding, Tensor *output) { const Tensor *filter,
const Tensor *bias,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output);
}; };
} // namespace kernels } // namespace kernels
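Note (editor): the 3x3 kernel above derives its global work sizes by rounding the output channels up to blocks of 4 and the output width up to blocks of 5. A minimal, self-contained sketch of that round-up division; RoundUpDiv4 / RoundUpDiv are assumed to behave like this, the real helpers are defined elsewhere in the tree.

// Illustrative sketch only, not the real mace/utils helpers.
#include <cstdint>
#include <iostream>

using index_t = int64_t;

template <typename T, unsigned Divisor = 4>
T RoundUpDivSketch(T value) {
  return (value + Divisor - 1) / Divisor;  // ceil(value / Divisor) for value >= 0
}

int main() {
  const index_t channels = 70, width = 33;
  // conv_2d_3x3 packs 4 output channels and 5 output-width pixels per work-item.
  std::cout << "channel_blocks = " << RoundUpDivSketch<index_t, 4>(channels)  // 18
            << ", width_blocks = " << RoundUpDivSketch<index_t, 5>(width)     // 7
            << std::endl;
  return 0;
}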
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(0)));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
} // namespace kernels
} // namespace mace
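Note (editor): each entry in the built_options set above is an OpenCL preprocessor define; BuildKernel is expected to fold them into the options string used when the program is compiled. A standalone sketch of that flattening, assuming a simple space-separated join (the real wiring lives inside OpenCLRuntime::BuildKernel and is not shown here).

// Sketch only: flattens a built_options set into a "-D..." options string.
#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=float");
  built_options.emplace("-DCMD_DATA_TYPE=f");
  built_options.emplace("-DSTRIDE=2");
  built_options.emplace("-DFUSED_RELU");
  built_options.emplace("");  // what gets emplaced when bias == nullptr

  std::string options;
  for (const std::string &opt : built_options) {
    if (!opt.empty()) options += opt + " ";  // skip the empty placeholder
  }
  std::cout << options << std::endl;
  return 0;
}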
...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, ...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options); auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/fused_conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
FusedConv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
input, filter, bias, output);
return;
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
}
template
struct FusedConv2dFunctor<DeviceType::OPENCL, float>;
template
struct FusedConv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
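Note (editor): FusedConv2dFunctor above dispatches through a kernel-size by stride table and falls back to the CPU functor whenever the entry is null. A trimmed-down sketch of just that lookup; the conv functions below are stubs, not the real kernels.

// Sketch only: kernel_size x stride dispatch with a nullptr fallback.
#include <cstdio>

typedef void (*Conv2dFn)();

static void Conv1x1S1() { std::puts("1x1 stride 1"); }
static void Conv1x1S2() { std::puts("1x1 stride 2"); }
static void Conv3x3S1() { std::puts("3x3 stride 1"); }
static void Conv3x3S2() { std::puts("3x3 stride 2"); }

int main() {
  static const Conv2dFn selector[5][2] = {
      {Conv1x1S1, Conv1x1S2},
      {nullptr, nullptr},
      {Conv3x3S1, Conv3x3S2},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  const int kernel = 3, stride = 2;
  Conv2dFn fn =
      (kernel <= 5 && stride <= 2) ? selector[kernel - 1][stride - 1] : nullptr;
  if (fn == nullptr) {
    std::puts("unsupported shape: fall back to the slow CPU path");
  } else {
    fn();
  }
  return 0;
}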
...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
std::string DataTypeToCLType(const DataType dt) { std::string DtToCLDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "float"; return "float";
case DT_HALF: case DT_HALF:
return "half"; return "half";
case DT_UINT8:
return "uchar";
case DT_INT8:
return "char";
case DT_DOUBLE:
return "double";
case DT_INT32:
return "int";
case DT_UINT32:
return "int";
case DT_UINT16:
return "ushort";
case DT_INT16:
return "short";
case DT_INT64:
return "long";
default: default:
LOG(FATAL) << "Unsupported data type"; LOG(FATAL) << "Unsupported data type";
return ""; return "";
} }
} }
std::string DataTypeToOPENCLCMDDataType(const DataType dt) { std::string DtToCLCMDDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "f"; return "f";
...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) { ...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
} }
} }
std::string DtToUpstreamCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpstreamCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
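Note (editor): the split between DtToCLDt / DtToCLCMDDt and their Upstream variants is that the latter always report float, even when the tensor's logical type is half, as the functions above show. A self-contained sketch of the mapping for the two types the file still handles; the enum below is a stand-in, not the real DataType.

// Sketch only: DT_* is an illustrative enum, the real mapping is in helper.cc.
#include <iostream>
#include <string>

enum DataTypeSketch { DT_FLOAT, DT_HALF };

std::string DtToCLDtSketch(DataTypeSketch dt) {
  return dt == DT_HALF ? "half" : "float";
}

std::string DtToUpstreamCLDtSketch(DataTypeSketch) {
  return "float";  // both DT_FLOAT and DT_HALF map to "float" here, per helper.cc above
}

int main() {
  std::cout << "-DDATA_TYPE=" << DtToCLDtSketch(DT_HALF) << "\n"
            << "-DDATA_TYPE=" << DtToUpstreamCLDtSketch(DT_HALF) << std::endl;
  return 0;
}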
...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape); std::vector<size_t> &image_shape);
std::string DataTypeToOPENCLCMDDataType(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
std::string DataTypeToCLType(const DataType dt); std::string DtToUpstreamCLCMDDt(const DataType dt);
std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -10,131 +10,94 @@ ...@@ -10,131 +10,94 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
static void Pooling3(const Tensor *input, static void Pooling(const Tensor *input,
const int *stride, const int *stride,
const PoolingType type, const int *paddings,
Tensor *output) { const int pooling_size,
if (type != MAX) { const PoolingType type,
MACE_NOT_IMPLEMENTED; const DataType dt,
} Tensor *output) {
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t out_height = output->dim(1);
index_t out_height = output->dim(2); index_t out_width = output->dim(2);
index_t out_width = output->dim(3); index_t channels = output->dim(3);
index_t channel_blk = (channels + 3) / 4; index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_width = (out_width + 3) / 4 ;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(pixel_width * out_height), static_cast<uint32_t>(batch * out_height),
}; };
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
}
if (type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
const uint32_t lws[3] = {1, 8, 128}; uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t idx = 0; uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2))); pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height)); pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width)); pooling_kernel.setArg(idx++, paddings[0] / 2);
pooling_kernel.setArg(idx++, paddings[1] / 2);
pooling_kernel.setArg(idx++, stride[0]); pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange, pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << error;
} }
static void PoolingN(const Tensor *input, template<typename T>
const int *stride, void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *paddings, Tensor *output) {
                      const int pooling_size,                               MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel does not support dilation yet";
const PoolingType type, std::vector<index_t> output_shape(4);
Tensor *output) { std::vector<int> paddings(2);
if (type != AVG) { std::vector<index_t> filter_shape = {
MACE_NOT_IMPLEMENTED; kernels_[0], kernels_[1],
} input->dim(3), input->dim(3)
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
index_t channel_blk = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(out_height * out_width),
}; };
auto runtime = OpenCLRuntime::Get(); kernels::CalcNHWCPaddingAndOutputSize(
std::set<std::string> built_options; input->shape().data(), filter_shape.data(),
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); dilations_, strides_, this->padding_,
auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); output_shape.data(), paddings.data());
const uint32_t lws[3] = {1, 8, 128}; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); DataTypeToEnum<T>::value, output);
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, paddings[0]);
pooling_kernel.setArg(idx++, paddings[1]);
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor *output) {
int paddings[2];
std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#define POOLING_HELPER \
switch(kernels_[0]) { \
case 3: \
Pooling3(input, strides_, pooling_type_, output); \
break; \
default: \
PoolingN(input, strides_, paddings, kernels_[0], \
pooling_type_, output); \
break; \
}
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
input = &padded_input;
POOLING_HELPER
} else {
POOLING_HELPER
}
#undef POOLING_HELPER
} }
template
struct PoolingFunctor<DeviceType::OPENCL, float>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
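Note (editor): the pooling kernel above no longer hard-codes a local work-group size; each dimension is clamped against the kernel's maximum work-group size so their product never exceeds it. A small sketch of that derivation with made-up sizes.

// Sketch only: mirrors the lws clamping used in the Pooling() function above.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const uint32_t kwg_size = 256;  // as returned by GetKernelMaxWorkGroupSize
  const uint32_t channel_blocks = 16, out_width = 28, out_height_x_batch = 28;

  uint32_t lws[3];
  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
  lws[2] = std::min<uint32_t>(out_height_x_batch, kwg_size / (lws[0] * lws[1]));

  std::cout << lws[0] << " x " << lws[1] << " x " << lws[2] << std::endl;  // 16 x 16 x 1
  return 0;
}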
...@@ -6,24 +6,33 @@ ...@@ -6,24 +6,33 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h" #include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <> template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output) { const Tensor *input, const Tensor *resize_dims, Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape {batch, channels, out_height, out_width}; std::vector<index_t> output_shape {batch, out_height, out_width, channels};
output->Resize(out_shape); if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options); auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
uint32_t idx = 0; uint32_t idx = 0;
rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
rb_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
rb_kernel.setArg(idx++, height_scale); rb_kernel.setArg(idx++, height_scale);
rb_kernel.setArg(idx++, width_scale); rb_kernel.setArg(idx++, width_scale);
rb_kernel.setArg(idx++, static_cast<int>(in_height)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
rb_kernel.setArg(idx++, static_cast<int>(in_width)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
auto command_queue = runtime->command_queue(); auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel( cl_int error = command_queue.enqueueNDRangeKernel(
rb_kernel, cl::NullRange, rb_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch * channels), cl::NDRange(static_cast<int32_t>(channel_blocks),
static_cast<int>(out_height), static_cast<int>(out_width)), static_cast<int32_t>(out_width),
// TODO (heliangliang) tuning and fix when kwg_size < devisor static_cast<int32_t>(out_height * batch)),
cl::NDRange(1, 16, kwg_size / 16), // TODO tuning
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
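Note (editor): both the OpenCL path above and the CPU path call CalculateResizeScale, where align_corners changes how input and output extents are related. A sketch assuming the usual TensorFlow-style convention; the real helper in resize_bilinear.h may differ.

// Sketch only: assumed behaviour of CalculateResizeScale.
#include <cstdint>
#include <iostream>

using index_t = int64_t;

float CalculateResizeScaleSketch(index_t in_size, index_t out_size,
                                 bool align_corners) {
  return (align_corners && out_size > 1)
             ? (in_size - 1) / static_cast<float>(out_size - 1)
             : in_size / static_cast<float>(out_size);
}

int main() {
  std::cout << CalculateResizeScaleSketch(4, 8, false) << "\n"   // 0.5
            << CalculateResizeScaleSketch(4, 8, true) << std::endl;  // ~0.428571
  return 0;
}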
...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te ...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
Tensor *batch_tensor) { Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options); auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -18,36 +18,66 @@ enum PoolingType { ...@@ -18,36 +18,66 @@ enum PoolingType {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> struct PoolingFunctorBase {
struct PoolingFunctor { PoolingFunctorBase(const PoolingType pooling_type,
PoolingFunctor(const PoolingType pooling_type, const int *kernels,
const int *kernels, const int *strides,
const int *strides, const Padding padding,
const Padding padding, const int *dilations)
const int *dilations)
: pooling_type_(pooling_type), : pooling_type_(pooling_type),
kernels_(kernels), kernels_(kernels),
strides_(strides), strides_(strides),
padding_(padding), padding_(padding),
dilations_(dilations) {} dilations_(dilations) {}
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
};
template<DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_,
output_shape.data(), paddings.data());
output_tensor->Resize(output_shape);
Tensor::MappingGuard in_guard(input_tensor); Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor); Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>(); const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>(); T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
index_t batch = output_shape[0]; index_t batch = output_shape[0];
index_t channels = output_shape[1]; index_t height = output_shape[1];
index_t height = output_shape[2]; index_t width = output_shape[2];
index_t width = output_shape[3]; index_t channels = output_shape[3];
index_t out_image_size = height * width; index_t out_image_size = height * width;
index_t input_channels = input_shape[1]; index_t input_height = input_shape[1];
index_t input_height = input_shape[2]; index_t input_width = input_shape[2];
index_t input_width = input_shape[3]; index_t input_channels = input_shape[3];
index_t in_image_size = input_height * input_width; index_t in_image_size = input_height * input_width;
int kernel_h = kernels_[0]; int kernel_h = kernels_[0];
...@@ -59,11 +89,6 @@ struct PoolingFunctor { ...@@ -59,11 +89,6 @@ struct PoolingFunctor {
int dilation_h = dilations_[0]; int dilation_h = dilations_[0];
int dilation_w = dilations_[1]; int dilation_w = dilations_[1];
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int padded_h_start = 0 - paddings[0] / 2; int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2; int padded_w_start = 0 - paddings[1] / 2;
...@@ -71,25 +96,24 @@ struct PoolingFunctor { ...@@ -71,25 +96,24 @@ struct PoolingFunctor {
if (pooling_type_ == MAX) { if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) { T res = std::numeric_limits<T>::lowest();
T max = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh; int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
max = std::max(max, input[input_offset]); res = std::max(res, input[input_offset]);
} }
} }
} }
output[out_offset] = max; *output = res;
out_offset += 1; output++;
} }
} }
} }
...@@ -97,11 +121,10 @@ struct PoolingFunctor { ...@@ -97,11 +121,10 @@ struct PoolingFunctor {
} else if (pooling_type_ == AVG) { } else if (pooling_type_ == AVG) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) {
T sum = 0; T sum = 0;
int block_size = 0; int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
...@@ -110,14 +133,14 @@ struct PoolingFunctor { ...@@ -110,14 +133,14 @@ struct PoolingFunctor {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset]; sum += input[input_offset];
block_size += 1; block_size += 1;
} }
} }
} }
output[out_offset] = sum / block_size; *output = sum / block_size;
out_offset += 1; output++;
} }
} }
} }
...@@ -125,22 +148,26 @@ struct PoolingFunctor { ...@@ -125,22 +148,26 @@ struct PoolingFunctor {
} }
} }
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
}; };
template <> template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()( void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor); Tensor *output_tensor);
template <> template<typename T>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()( struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Tensor *input_tensor, PoolingFunctor(const PoolingType pooling_type,
Tensor *output_tensor); const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
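Note (editor): the CPU pooling loops above were rewritten from NCHW to NHWC, which is why the inner offsets now multiply by input_channels. Element (b, h, w, c) of an NHWC tensor lives at ((b * H + h) * W + w) * C + c; a tiny sanity-check sketch of the offset arithmetic the loops rely on.

// Sanity-check sketch of the NHWC offset decomposition used above.
#include <cassert>
#include <cstdint>
#include <iostream>

using index_t = int64_t;

index_t NhwcOffset(index_t b, index_t h, index_t w, index_t c,
                   index_t H, index_t W, index_t C) {
  return ((b * H + h) * W + w) * C + c;
}

int main() {
  const index_t H = 4, W = 5, C = 3;
  const index_t b = 1, c = 2, inh = 3, inw = 4;
  // Same decomposition as the pooling loops: per-(batch, channel) base offset
  // plus (inh * W + inw) * C for the visited pixel.
  const index_t in_offset = b * (H * W) * C + c;
  const index_t input_offset = in_offset + (inh * W + inw) * C;
  assert(input_offset == NhwcOffset(b, inh, inw, c, H, W, C));
  std::cout << "offset = " << input_offset << std::endl;  // 119
  return 0;
}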
......
...@@ -61,63 +61,90 @@ void ResizeImage(const T *images, ...@@ -61,63 +61,90 @@ void ResizeImage(const T *images,
const index_t channels, const index_t channels,
const std::vector<CachedInterpolation> &xs_vec, const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys, const std::vector<CachedInterpolation> &ys,
float *output) { T *output) {
const index_t in_channel_size = in_height * in_width; const index_t in_batch_num_values = channels * in_height * in_width;
const index_t in_batch_num_values = channels * in_channel_size; const index_t out_batch_num_values = channels * out_height * out_width;
const index_t out_channel_size = out_height * out_width;
const index_t out_batch_num_values = channels * out_channel_size;
const CachedInterpolation *xs = xs_vec.data(); const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2) #pragma omp parallel for
for (index_t b = 0; b < batch_size; ++b) { for (index_t b = 0; b < batch_size; ++b) {
    for (index_t c = 0; c < channels; ++c) {                                    const T *batch_input_ptr = images + in_batch_num_values * b;
const T *input_ptr = T *batch_output_ptr = output + out_batch_num_values * b;
images + in_batch_num_values * b + in_channel_size * c;
float *output_ptr = for (index_t y = 0; y < out_height; ++y) {
output + out_batch_num_values * b + out_channel_size * c; const T *y_lower_input_ptr =
for (index_t y = 0; y < out_height; ++y) { batch_input_ptr + ys[y].lower * in_width * channels;
const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width; const T *y_upper_input_ptr =
const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width; batch_input_ptr + ys[y].upper * in_width * channels;
const float ys_lerp = ys[y].lerp; T *y_output_ptr = batch_output_ptr + y * out_width * channels;
for (index_t x = 0; x < out_width; ++x) { const float ys_lerp = ys[y].lerp;
auto xs_lower = xs[x].lower;
auto xs_upper = xs[x].upper; for (index_t x = 0; x < out_width; ++x) {
auto xs_lerp = xs[x].lerp; const float xs_lerp = xs[x].lerp;
const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
const float top_left = ys_input_lower_ptr[xs_lower]; const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
const float top_right = ys_input_lower_ptr[xs_upper]; const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
const float bottom_left = ys_input_upper_ptr[xs_lower]; const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
const float bottom_right = ys_input_upper_ptr[xs_upper]; T *output_ptr = y_output_ptr + x * channels;
output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left, for (index_t c = 0; c < channels; ++c) {
bottom_right, xs_lerp, ys_lerp); const T top_left = top_left_ptr[c];
const T top_right = top_right_ptr[c];
const T bottom_left = bottom_left_ptr[c];
const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
} }
output_ptr += out_width;
} }
} }
} }
} }
} }
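Note (editor): ResizeImage above hands the four neighbouring pixels to ComputeLerp, which is assumed to perform the standard two-axis linear interpolation (its definition is elsewhere in this header). A sketch of that computation.

// Sketch only: assumed behaviour of ComputeLerp.
#include <cstdio>

float ComputeLerpSketch(float top_left, float top_right,
                        float bottom_left, float bottom_right,
                        float x_lerp, float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return top + (bottom - top) * y_lerp;
}

int main() {
  // The exact center of a 2x2 patch averages the four corners.
  std::printf("%f\n", ComputeLerpSketch(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f));  // 1.5
  return 0;
}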
struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners)
: align_corners_(align_corners), size_(size) {}
protected:
void GetOutputSize(const Tensor *resize_dims,
index_t *out_height,
index_t *out_width) {
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
bool align_corners_;
std::vector<index_t> size_;
};
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ResizeBilinearFunctor { struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
public:
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: align_corners_(align_corners), size_(size) {} : ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *resize_dims, const Tensor *resize_dims,
Tensor *output) { Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width}; std::vector<index_t> out_shape{batch, out_height, out_width, channels};
output->Resize(out_shape); output->Resize(out_shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor { ...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor {
ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
channels, xs, ys, output_data); channels, xs, ys, output_data);
} }
};
protected: template<typename T>
void GetOutputSize(const Tensor *resize_dims, struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
index_t *out_height, ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
index_t *out_width) { : ResizeBilinearFunctorBase(size, align_corners) {}
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
private: void operator()(const Tensor *input,
bool align_corners_; const Tensor *resize_dims,
std::vector<index_t> size_; Tensor *output);
}; };
template <>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -22,4 +22,10 @@ def if_android_arm64(a): ...@@ -22,4 +22,10 @@ def if_android_arm64(a):
return select({ return select({
"//mace:android_arm64": a, "//mace:android_arm64": a,
"//conditions:default": [], "//conditions:default": [],
}) })
\ No newline at end of file
def if_profiling(a):
return select({
"//mace:is_profiling": a,
"//conditions:default": [],
})
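Note (editor): if_profiling mirrors the existing if_android helpers, it returns its argument only when the //mace:is_profiling config setting is active and an empty list otherwise, so profiling-only copts stay out of normal builds. A hypothetical C++ consumer of such a compile flag, purely for illustration; the macro name below is not the one MACE actually defines.

// Hypothetical consumer of a profiling-only compile flag.
#include <chrono>
#include <iostream>

void RunOnce() { /* ... enqueue a kernel and wait ... */ }

int main() {
#ifdef PROFILING_ENABLED  // illustrative macro name
  auto start = std::chrono::steady_clock::now();
  RunOnce();
  auto end = std::chrono::steady_clock::now();
  std::cout << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
            << " us" << std::endl;
#else
  RunOnce();
#endif
  return 0;
}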
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
namespace mace { namespace mace {
template<DeviceType D, class T> template <DeviceType D, class T>
class AddNOp : public Operator<D, T> { class AddNOp : public Operator<D, T> {
public: public:
AddNOp(const OperatorDef &operator_def, Workspace *ws) AddNOp(const OperatorDef &operator_def, Workspace *ws)
...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> { ...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
bool Run() override { bool Run() override {
Tensor *output_tensor = this->outputs_[0]; Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
int n = this->inputs_.size(); int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr); vector<const Tensor *> inputs(n, nullptr);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
......
...@@ -9,47 +9,69 @@ ...@@ -9,47 +9,69 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int n, int size) { static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpDefBuilder op_def_builder("AddN", "AddNBM"); // Add input data
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str()); net.AddRandomInput<D, float>(
internal::MakeString("Input", i).c_str(), {n, h, w, c});
} }
op_def_builder.Output("Output").Finalize(net.NewOperatorDef());
// Add input data if (D == DeviceType::OPENCL) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<DeviceType::CPU, float>(internal::MakeString("Input", i).c_str(), {size}); BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
internal::MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} }
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
} }
#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \ #define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \ static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
const int64_t tot = static_cast<int64_t>(iters) * N * SIZE; \ int iters) { \
mace::testing::ItemsProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::ItemsProcessed(tot); \
AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
} \ AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE) } \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(N, SIZE, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \ #define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, NEON); BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(10, 1000, float);
BM_ADDN(10, 10000, float); BM_ADDN(2, 1, 240, 240, 256, float);
BM_ADDN(100, 1000, float); // BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(100, 10000, float); BM_ADDN(4, 1, 240, 240, 256, float);
} // namespace mace // BM_ADDN(4, 1, 240, 240, 256, half);
\ No newline at end of file
} // namespace mace
...@@ -9,7 +9,7 @@ namespace mace { ...@@ -9,7 +9,7 @@ namespace mace {
class AddnOpTest : public OpsTestBase {}; class AddnOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void SimpleAdd2() { void SimpleAdd2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -20,30 +20,26 @@ void SimpleAdd2() { ...@@ -20,30 +20,26 @@ void SimpleAdd2() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd2) { TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
SimpleAdd2<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd2) { /*
SimpleAdd2<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) { TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
SimpleAdd2<DeviceType::OPENCL>(); */
}
template<DeviceType D> template <DeviceType D>
void SimpleAdd3() { void SimpleAdd3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -55,62 +51,80 @@ void SimpleAdd3() { ...@@ -55,62 +51,80 @@ void SimpleAdd3() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd3) { TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
SimpleAdd3<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd3) { /*
SimpleAdd3<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
} */
template<DeviceType D> template <DeviceType D>
void RandomTest() { void RandomTest() {
// Construct graph testing::internal::LogToStderr();
OpsTestNet net; srand(time(NULL));
OpDefBuilder("AddN", "AddNTest")
.Input("Input1") for (int round = 0; round < 10; ++round) {
.Input("Input2") // generate random input
.Output("Output") index_t n = 1 + (rand() % 5);
.Finalize(net.NewOperatorDef()); index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
// Add input data index_t c = 1 + (rand() % 32);
net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4}); int input_num = 2 + rand() % 3;
net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4}); // Construct graph
OpsTestNet net;
// Check auto op_def = OpDefBuilder("AddN", "AddNTest");
net.RunOp(D); for (int i = 0; i < input_num; ++i) {
op_def.Input("Input" + ToString(i));
Tensor result; }
result.Copy(*net.GetOutput("Output")); op_def.Output("Output").Finalize(net.NewOperatorDef());
// Run // Add input data
net.RunOp(); for (int i = 0; i < input_num; ++i) {
net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5); }
}
// run on cpu
TEST_F(AddnOpTest, CPURandom) { net.RunOp();
RandomTest<DeviceType::CPU>(); // Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(net, "Input" + ToString(i),
"InputImage" + ToString(i),
kernels::BufferType::IN_OUT);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input("InputImage" + ToString(i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
}
} }
TEST_F(AddnOpTest, NEONRandom) { TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
RandomTest<DeviceType::NEON>();
}
TEST_F(AddnOpTest, OPENCLRandom) {
RandomTest<DeviceType::OPENCL>();
}
} // namespace mace } // namespace mace
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, float>);
} // namespace mace REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
\ No newline at end of file .TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
} // namespace mace
...@@ -13,28 +13,45 @@ static void BatchNorm( ...@@ -13,28 +13,45 @@ static void BatchNorm(
int iters, int batch, int channels, int height, int width) { int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming(); mace::testing::StopTiming();
if ( D == OPENCL )
OpenCLRuntime::EnableProfiling();
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Scale", {channels}); net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels}); net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels}); net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true); net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
else {
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(D); net.RunOp(D);
......
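Note (editor): for reference while reading the test changes below, the expected tensor in Simple() is consistent with the usual batch-norm formula y = (x - mean) / sqrt(var + epsilon) * scale + offset. A quick standalone check of the numbers used there.

// Reproduces the expected values {-3.86, -1.51, 0.83, 3.17, 5.51, 7.86} from the test.
#include <cmath>
#include <cstdio>

int main() {
  const float scale = 4.0f, offset = 2.0f, mean = 10.0f, var = 11.67f, eps = 1e-3f;
  const float inputs[] = {5, 7, 9, 11, 13, 15};
  for (float x : inputs) {
    const float y = (x - mean) / std::sqrt(var + eps) * scale + offset;
    std::printf("%.2f ", y);
  }
  std::printf("\n");
  return 0;
}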
...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {}; ...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {};
template <DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
// Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 6, 2}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}); net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}); net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
...@@ -32,12 +22,44 @@ void Simple() { ...@@ -32,12 +22,44 @@ void Simple() {
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}); net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, CreateTensor<float>({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); 3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) { ...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
Simple<DeviceType::CPU>(); Simple<DeviceType::CPU>();
} }
/*
TEST_F(BatchNormOpTest, SimpleNEON) { TEST_F(BatchNormOpTest, SimpleNEON) {
Simple<DeviceType::NEON>(); Simple<DeviceType::NEON>();
} }
*/
TEST_F(BatchNormOpTest, SimpleOPENCL) { TEST_F(BatchNormOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>(); Simple<DeviceType::OPENCL>();
} }
/*
TEST_F(BatchNormOpTest, SimpleRandomNeon) { TEST_F(BatchNormOpTest, SimpleRandomNeon) {
srand(time(NULL)); srand(time(NULL));
...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) { ...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
*/
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL)); srand(time(NULL));
...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 64; index_t height = 64;
index_t width = 64; index_t width = 64;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// tuning // run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 103; index_t height = 103;
index_t width = 113; index_t width = 113;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync(); net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
} }
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchToSpaceND")
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<float>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
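For orientation, the typed registration key produced by these calls concatenates the op name with the constrained data type, so both registrations above land under distinct keys. A small illustration (not part of this commit; the exact key text depends on DataTypeToString):

// Illustration only: the non-template TypeConstraint overload takes the
// attribute name and a DataType value.
const std::string float_key = OpKeyBuilder("BufferToImage")
                                  .TypeConstraint("T", DT_FLOAT)
                                  .Build();
const std::string half_key = OpKeyBuilder("BufferToImage")
                                  .TypeConstraint("T", DT_HALF)
                                  .Build();
// float_key != half_key, which is what lets the same op name dispatch to
// BufferToImageOp<OPENCL, float> or BufferToImageOp<OPENCL, half> at runtime.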
...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { ...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
} }
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgMedia) { TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
} }
...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) { ...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) {
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
} }
template<DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
// Run
net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3);
}
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
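The float-to-half round trip above only exercises the ARGUMENT buffer type. A hypothetical extension for the IN_OUT layout could follow the same pattern (test name, shape, and the suitability of the 1e-3 tolerance for fp16 rounding are assumptions, not part of this commit):

// Hypothetical extension, mirroring ArgFloatToHalfSmall above.
TEST(BufferToImageTest, InputFloatToHalfSmall) {
  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::IN_OUT,
                                                             {1, 2, 3, 4});
}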
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("ChannelShuffle")
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Concat, ConcatOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,12 +6,31 @@ ...@@ -6,12 +6,31 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -33,9 +33,9 @@ static void Conv2d(int iters, ...@@ -33,9 +33,9 @@ static void Conv2d(int iters,
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -44,6 +44,7 @@ static void Conv2d(int iters, ...@@ -44,6 +44,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -54,6 +55,7 @@ static void Conv2d(int iters, ...@@ -54,6 +55,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
...@@ -91,39 +93,39 @@ static void Conv2d(int iters, ...@@ -91,39 +93,39 @@ static void Conv2d(int iters,
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet // ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float); BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float); BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
// Test RGB <-> YUV // Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments //BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); //BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); //BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); //BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
} // namespace mace } // namespace mace
This diff is collapsed.
...@@ -6,15 +6,21 @@ ...@@ -6,15 +6,21 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(DepthwiseConv2d, REGISTER_CPU_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>); DepthwiseConv2dOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(DepthwiseConv2d, REGISTER_NEON_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::NEON, float>); DepthwiseConv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(DepthwiseConv2d, REGISTER_OPENCL_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>); DepthwiseConv2dOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/fused_conv_2d.h"
namespace mace {
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, half>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_FUSED_CONV_2D_H_
#define MACE_OPS_FUSED_CONV_2D_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/fused_conv_2d.h"
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
template <DeviceType D, typename T>
class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
public:
FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_,
this->dilations_.data()) {
}
bool Run() override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
return true;
}
private:
kernels::FusedConv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_FUSED_CONV_2D_H_
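A minimal sketch of how this new operator would be built from a test, mirroring the Conv2D tests touched elsewhere in this commit (assumed usage only; the Padding value, shapes, and image names are illustrative):

// Assumed test-side usage of FusedConv2D, given an OpsTestNet `net` whose
// image tensors were prepared with BufferToImage as in the Conv2D tests.
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
    .Input("InputImage")
    .Input("FilterImage")
    .Input("BiasImage")                      // optional: InputSize() > 2 above
    .Output("OutputImage")
    .AddIntsArg("strides", {1, 1})
    .AddIntArg("padding", Padding::SAME)     // Padding enum value assumed
    .AddIntsArg("dilations", {1, 1})
    .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
    .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::OPENCL);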
This diff is collapsed.
...@@ -6,11 +6,15 @@ ...@@ -6,11 +6,15 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(GlobalAvgPooling, REGISTER_CPU_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::CPU, float>); GlobalAvgPoolingOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(GlobalAvgPooling, REGISTER_NEON_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::NEON, float>); GlobalAvgPoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
......
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<float>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<half>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape, ...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> &res) { std::vector<T> &res) {
std::random_device rd; std::random_device rd;
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1); std::normal_distribution<float> nd(0, 1);
index_t size = std::accumulate(shape.begin(), shape.end(), 1, index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
res.resize(size); res.resize(size);
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
} else {
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
}
} }
template <typename T> template <typename T>
...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) { ...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b); EXPECT_DOUBLE_EQ(a, b);
} }
inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { inline void AssertSameDims(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), y.dtype());
ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
<< "y.shape [ " << ShapeToString(y) << "]"; << "y.shape [ " << ShapeToString(y) << "]";
} }
template <typename T, bool is_fp = is_floating_point_type<T>::value> template <typename EXP_TYPE, typename RES_TYPE, bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector; struct Expector;
// Partial specialization for float and double. // Partial specialization for float and double.
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<T, true> { struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) { static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a(i), b(i)); ExpectEqual(a(i), b(i));
} }
} }
static void Near(const Tensor &x, const Tensor &y, const double abs_err) { static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i; << " index = " << i;
...@@ -334,17 +340,18 @@ template <typename T> ...@@ -334,17 +340,18 @@ template <typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<T>::value, static_assert(is_floating_point_type<T>::value,
"T is not a floating point type"); "T is not a floating point type");
Expector<T>::Near(x, y, abs_err); Expector<T, T>::Near(x, y, abs_err);
} }
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
std::string ToString(const T &input) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
std::stringstream ss; static_assert(is_floating_point_type<EXP_TYPE>::value
ss << input; && is_floating_point_type<RES_TYPE>::value,
return ss.str(); "T is not a floating point type");
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
} }
template <DeviceType D> template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net, void BufferToImage(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
net.Sync(); net.Sync();
} }
template <DeviceType D> template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet &net, void ImageToBuffer(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, ...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
......
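Taken together, the typed BufferToImage/ImageToBuffer helpers and the two-type ExpectTensorNear overload let a float CPU reference be checked against a half OpenCL result. A hedged sketch (the tensor names and the 1e-1 tolerance are assumptions, and half must count as a floating-point type for the test utilities):

// Sketch only: compare a float reference tensor against a half OpenCL output.
ImageToBuffer<DeviceType::OPENCL, half>(net, "OutputImage", "OPENCLOutput",
                                        kernels::BufferType::IN_OUT);
ExpectTensorNear<float, half>(expected, *net.GetOutput("OPENCLOutput"), 1e-1);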
...@@ -6,11 +6,29 @@ ...@@ -6,11 +6,29 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Pooling, PoolingOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Pooling, PoolingOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Pooling, PoolingOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> { ...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
// TODO(chenghui): is it kind of a hack?
filter_shape[0] = input->shape()[1];
filter_shape[1] = input->shape()[0];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_(input, output); functor_(input, output);
return true; return true;
}; };
......
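The output-shape computation removed from PoolingOp::Run presumably moves into the pooling functor. A hedged sketch of the relocated logic, expressed as a standalone helper (only the removed call itself is taken from this diff; the helper name, parameter types, and index choices are assumptions):

// Assumed relocation of the removed resize logic; not shown in this commit.
inline void ResizePoolingOutput(const Tensor *input, const int *kernels,
                                const int *strides, const int *dilations,
                                Padding padding, Tensor *output) {
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  std::vector<index_t> filter_shape = {input->shape()[1], input->shape()[0],
                                       kernels[0], kernels[1]};
  kernels::CalcPaddingAndOutputSize(input->shape().data(), filter_shape.data(),
                                    dilations, strides, padding,
                                    output_shape.data(), paddings.data());
  output->Resize(output_shape);
}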
This diff is collapsed.
...@@ -6,10 +6,16 @@ ...@@ -6,10 +6,16 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
......
This diff is collapsed. (8 more files)