Commit eef80d7c authored by yejianwu

merge with master

@@ -23,3 +23,11 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+
+config_setting(
+    name = "is_profiling",
+    define_values = {
+        "profiling": "true",
+    },
+    visibility = ["//visibility:public"],
+)
@@ -7,7 +7,7 @@ package(

 licenses(["notice"]) # Apache 2.0

-load("//mace:mace.bzl", "if_android")
+load("//mace:mace.bzl", "if_android", "if_profiling")

 cc_library(
     name = "opencl_runtime",
@@ -19,7 +19,7 @@ cc_library(
         "runtime/opencl/cl2.hpp",
         "runtime/opencl/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
     deps = [
         ":logging",
         "@opencl_headers//:opencl20_headers",
...
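For reference: a Bazel config_setting with define_values is matched by the --define command-line flag, so the profiling copt above can be switched on with "bazel build --define profiling=true" on the affected targets; if_profiling is presumably a select() helper over //mace:is_profiling defined in mace.bzl (not shown in this diff).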
@@ -1098,7 +1098,7 @@ namespace half_float

     /// Conversion constructor.
     /// \param rhs float to convert
-    explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+    half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}

     /// Conversion to single-precision.
     /// \return single precision value representing expression value
...
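Dropping explicit here makes the float-to-half conversion implicit, which generic kernel code instantiated with T = half (as elsewhere in this commit) relies on. A one-line illustration, not part of the diff:

  half_float::half h = 0.5f;  // compiles only once the constructor is non-explicit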
@@ -13,6 +13,7 @@ namespace {
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   switch (t) {
     case DT_HALF:
+      return CL_HALF_FLOAT;
     case DT_FLOAT:
       return CL_FLOAT;
     case DT_INT8:
@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
   cl_int error;
   cl::Image2D *cl_image =
       new cl::Image2D(OpenCLRuntime::Get()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ,
+                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       img_format,
                       image_shape[0], image_shape[1],
                       0, nullptr, &error);
+  MACE_CHECK(error == CL_SUCCESS);
   return cl_image;
 }
...
@@ -6,6 +6,24 @@

 namespace mace {

+OpKeyBuilder::OpKeyBuilder(const char *op_name): op_name_(op_name) {}
+
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name,
+                                           const DataType allowed) {
+  type_constraint_[attr_name] = allowed;
+  return *this;
+}
+
+const std::string OpKeyBuilder::Build() {
+  static const std::vector<std::string> type_order = {"T"};
+  std::string key = op_name_;
+  for (auto type : type_order) {
+    key += type + "_" + DataTypeToString(type_constraint_[type]);
+  }
+  return key;
+}
+
 std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() {
   static std::map<int32_t, OperatorRegistry *> g_device_type_registry;
   return &g_device_type_registry;
@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
                                         Workspace *ws,
                                         DeviceType type) {
   OperatorRegistry *registry = gDeviceTypeRegistry()->at(type);
-  return registry->Create(operator_def.type(), operator_def, ws);
+  const int dtype = ArgumentHelper::GetSingleArgument<OperatorDef, int>(operator_def,
+                                                                        "T",
+                                                                        static_cast<int>(DT_FLOAT));
+  return registry->Create(OpKeyBuilder(operator_def.type().data())
+                              .TypeConstraint("T", static_cast<DataType>(dtype))
+                              .Build(),
+                          operator_def,
+                          ws);
 }

 OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
...
@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer {
   }
 };

+class OpKeyBuilder {
+ public:
+  explicit OpKeyBuilder(const char *op_name);
+
+  OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed);
+
+  template <typename T>
+  OpKeyBuilder &TypeConstraint(const char *attr_name);
+
+  const std::string Build();
+
+ private:
+  std::string op_name_;
+  std::map<std::string, DataType> type_constraint_;
+};
+
+template <typename T>
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
+  return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
+}
+
 #define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \
   namespace {                                              \
     static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \
...
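For orientation, a sketch of the registry key this builder produces, derived directly from Build() above (the op name is illustrative):

  // DataTypeToString(DT_FLOAT) yields "DT_FLOAT", so:
  std::string key = OpKeyBuilder("Conv2D").TypeConstraint<float>("T").Build();
  // key == "Conv2DT_DT_FLOAT" -- the string CreateOperator now passes to registry->Create()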
@@ -106,10 +106,10 @@ class Registerer {
   }

 #define MACE_REGISTER_CREATOR(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CREATOR(RegistryName, key, __VA_ARGS__)

 #define MACE_REGISTER_CLASS(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CLASS(RegistryName, key, __VA_ARGS__)

 } // namespace mace
...
@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() {
       return;
     }

+    cl_command_queue_properties properties = 0;
+#ifdef __ENABLE_PROFILING
+    enable_profiling_ = true;
+    profiling_ev_.reset(new cl::Event());
+    properties = CL_QUEUE_PROFILING_ENABLE;
+#endif
+
     // a context is like a "runtime link" to the device and platform;
     // i.e. communication is possible
     cl::Context context({gpu_device});
-    cl_command_queue_properties properties = 0;
-    if (enable_profiling_) {
-      profiling_ev_.reset(new cl::Event());
-      properties = CL_QUEUE_PROFILING_ENABLE;
-    }
     cl::CommandQueue command_queue(context, gpu_device, properties);

     instance = new OpenCLRuntime(context, gpu_device, command_queue);
@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() {
 }

 cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
 }

 cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 }

@@ -139,6 +141,7 @@ const std::map<std::string, std::string>
     OpenCLRuntime::program_map_ = {
   {"addn", "addn.cl"},
   {"batch_norm", "batch_norm.cl"},
+  {"conv_2d", "conv_2d.cl"},
   {"conv_2d_1x1", "conv_2d_1x1.cl"},
   {"conv_2d_3x3", "conv_2d_3x3.cl"},
   {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
...
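A minimal sketch of timing a kernel with these accessors, assuming a profiling build (--define profiling=true) and that the kernel was enqueued with GetDefaultEvent() as its completion event; OpenCL profiling counters are in nanoseconds:

  auto runtime = OpenCLRuntime::Get();
  runtime->command_queue().finish();  // wait until the profiled event has completed
  cl_ulong start_ns = runtime->GetEventProfilingStartInfo();
  cl_ulong end_ns = runtime->GetEventProfilingEndInfo();
  double elapsed_ms = (end_ns - start_ns) / 1e6;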
@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }

+std::string DataTypeToString(const DataType dt) {
+  static std::map<DataType, std::string> dtype_string_map = {
+      {DT_FLOAT, "DT_FLOAT"},
+      {DT_HALF, "DT_HALF"},
+      {DT_DOUBLE, "DT_DOUBLE"},
+      {DT_UINT8, "DT_UINT8"},
+      {DT_INT8, "DT_INT8"},
+      {DT_INT32, "DT_INT32"},
+      {DT_UINT32, "DT_UINT32"},
+      {DT_UINT16, "DT_UINT16"},
+      {DT_INT64, "DT_INT64"},
+      {DT_BOOL, "DT_BOOL"},
+      {DT_STRING, "DT_STRING"}
+  };
+  MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
+  return dtype_string_map[dt];
+}
+
 size_t GetEnumTypeSize(const DataType dt) {
   switch (dt) {
...
@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt);

 size_t GetEnumTypeSize(const DataType dt);

+std::string DataTypeToString(const DataType dt);
+
 template <class T>
 struct IsValidDataType;
...
@@ -24,7 +24,7 @@ cc_library(
         "*.h",
         "hexagon/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/proto:cc_proto",
         "//mace/core:core",
@@ -36,7 +36,7 @@ cc_test(
     name = "dsp_test",
     testonly = 1,
     srcs = glob(["*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -52,7 +52,7 @@ cc_test(
     name = "dsp_op_test",
     testonly = 1,
     srcs = glob(["test/*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -64,3 +64,21 @@ cc_test(
         "//mace/kernels:kernels",
     ],
 )
+
+cc_binary(
+    name = "mace_dsp_run",
+    srcs = [
+        "tool/mace_dsp_run.cc",
+    ],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
+    linkopts = if_android([
+        "-ldl",
+        "-lm",
+    ]),
+    linkstatic = 1,
+    deps = [
+        ":dsp",
+        "//mace/kernels:kernels",
+        "//mace/utils:command_line_flags",
+    ],
+)
\ No newline at end of file
@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) {
   }

   // input info
-  const InputInfo& input_info = net_def.input_info()[0];
-  input_shape_.insert(input_shape_.begin(),
-                      input_info.dims().begin(), input_info.dims().end());
-  while (input_shape_.size() < 4) {
-    input_shape_.insert(input_shape_.begin(), 1);
+  num_inputs_ = 0;
+  for (const InputInfo &input_info: net_def.input_info()) {
+    vector<index_t> input_shape;
+    input_shape.insert(input_shape.begin(),
+                       input_info.dims().begin(), input_info.dims().end());
+    while (input_shape.size() < 4) {
+      input_shape.insert(input_shape.begin(), 1);
+    }
+    input_shapes_.push_back(input_shape);
+    input_data_types_.push_back(input_info.data_type());
+    num_inputs_ += 1;
   }
-  input_data_type_ = input_info.data_type();

   // output info
-  const OutputInfo& output_info = net_def.output_info()[0];
-  output_shape_.insert(output_shape_.begin(),
-                       output_info.dims().begin(), output_info.dims().end());
-  while (output_shape_.size() < 4) {
-    output_shape_.insert(output_shape_.begin(), 1);
+  num_outputs_ = 0;
+  for (const OutputInfo &output_info: net_def.output_info()) {
+    vector<index_t> output_shape;
+    output_shape.insert(output_shape.begin(),
+                        output_info.dims().begin(), output_info.dims().end());
+    while (output_shape.size() < 4) {
+      output_shape.insert(output_shape.begin(), 1);
+    }
+    output_shapes_.push_back(output_shape);
+    output_data_types_.push_back(output_info.data_type());
+    num_outputs_ += 1;
   }
-  output_data_type_ = output_info.data_type();

   bool res = hexagon_nn_prepare(nn_id_) == 0;
   return res;
@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
   hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
 }

+bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
+                                         Tensor *output_tensor) {
+  LOG(INFO) << "Execute graph: " << nn_id_;
+  // single input and single output
+  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
+  output_tensor->SetDtype(output_data_types_[0]);
+  output_tensor->Resize(output_shapes_[0]);
+  vector<uint32_t> output_shape(4);
+  uint32_t output_bytes;
+  int res = hexagon_nn_execute(nn_id_,
+                               input_tensor.shape()[0],
+                               input_tensor.shape()[1],
+                               input_tensor.shape()[2],
+                               input_tensor.shape()[3],
+                               reinterpret_cast<const unsigned char *>(
+                                   input_tensor.raw_data()),
+                               input_tensor.raw_size(),
+                               &output_shape[0],
+                               &output_shape[1],
+                               &output_shape[2],
+                               &output_shape[3],
+                               reinterpret_cast<unsigned char *>(
+                                   output_tensor->raw_mutable_data()),
+                               output_tensor->raw_size(),
+                               &output_bytes);
+
+  MACE_ASSERT(output_shape == output_shapes_[0],
+              "wrong output shape inferred");
+  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
+              "wrong output bytes inferred.");
+  return res == 0;
+};
+
+bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
+                                            vector<Tensor> *output_tensors) {
+  LOG(INFO) << "Execute graph new: " << nn_id_;
+  int num_inputs = input_tensors.size();
+  int num_outputs = output_tensors->size();
+  MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
+
+  hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
+  hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
+
+  for (int i = 0; i < num_inputs; ++i) {
+    vector<index_t> input_shape = input_tensors[i].shape();
+    inputs[i].batches = input_shape[0];
+    inputs[i].height = input_shape[1];
+    inputs[i].width = input_shape[2];
+    inputs[i].depth = input_shape[3];
+    inputs[i].data = const_cast<unsigned char *>(
+        reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
+    inputs[i].dataLen = input_tensors[i].raw_size();
+    inputs[i].data_valid_len = input_tensors[i].raw_size();
+    inputs[i].unused = 0;
+  }
+
+  for (int i = 0; i < num_outputs; ++i) {
+    (*output_tensors)[i].SetDtype(output_data_types_[i]);
+    (*output_tensors)[i].Resize(output_shapes_[i]);
+    outputs[i].data = reinterpret_cast<unsigned char *>(
+        (*output_tensors)[i].raw_mutable_data());
+    outputs[i].dataLen = (*output_tensors)[i].raw_size();
+  }
+
+  int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
+                                   outputs, num_outputs);
+
+  for (int i = 0; i < num_outputs; ++i) {
+    vector<uint32_t> output_shape {outputs[i].batches, outputs[i].height,
+                                   outputs[i].width, outputs[i].depth};
+    MACE_ASSERT(output_shape == output_shapes_[i],
+                "wrong output shape inferred");
+    MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
+                "wrong output bytes inferred.");
+  }
+
+  delete [] inputs;
+  delete [] outputs;
+  return res == 0;
+};
+
+bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
+                                                    Tensor *output_tensor) {
+  vector<Tensor> input_tensors(3);
+  vector<Tensor> output_tensors(3);
+  input_tensors[0].SetDtype(DT_UINT8);
+  output_tensors[0].SetDtype(DT_UINT8);
+  input_tensors[0].ResizeLike(input_tensor);
+  input_tensors[1].Resize({1, 1, 1, 1});
+  float *min_in_data = input_tensors[1].mutable_data<float>();
+  input_tensors[2].Resize({1, 1, 1, 1});
+  float *max_in_data = input_tensors[2].mutable_data<float>();
+  quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data);
+
+  if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
+    return false;
+  }
+
+  output_tensor->ResizeLike(output_tensors[0]);
+
+  const float *min_out_data = output_tensors[1].data<float>();
+  const float *max_out_data = output_tensors[2].data<float>();
+  quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor);
+  return true;
+}
+
 } // namespace mace
\ No newline at end of file
@@ -7,6 +7,7 @@

 #include "mace/dsp/hexagon/hexagon_controller.h"
 #include "mace/dsp/hexagon_nn_ops.h"
+#include "mace/dsp/util/quantize.h"
 #include "mace/core/common.h"
 #include "mace/core/tensor.h"
 #include "mace/proto/mace.pb.h"
@@ -23,35 +24,10 @@ class HexagonControlWrapper {
   bool Finalize();
   bool SetupGraph(const NetDef& net_def);
   bool SetupGraph(const std::string &model_file);
-  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) {
-    LOG(INFO) << "Execute graph: " << nn_id_;
-    output_tensor->SetDtype(output_data_type_);
-    output_tensor->Resize(output_shape_);
-    vector<uint32_t> output_shape(4);
-    uint32_t output_bytes;
-    int res = hexagon_nn_execute(nn_id_,
-                                 input_tensor.shape()[0],
-                                 input_tensor.shape()[1],
-                                 input_tensor.shape()[2],
-                                 input_tensor.shape()[3],
-                                 reinterpret_cast<const unsigned char *>(
-                                     input_tensor.raw_data()),
-                                 input_tensor.raw_size(),
-                                 &output_shape[0],
-                                 &output_shape[1],
-                                 &output_shape[2],
-                                 &output_shape[3],
-                                 reinterpret_cast<unsigned char *>(
-                                     output_tensor->raw_mutable_data()),
-                                 output_tensor->raw_size(),
-                                 &output_bytes);
-    MACE_ASSERT(output_shape == output_shape_,
-                "wrong output shape inferred");
-    MACE_ASSERT(output_bytes == output_tensor->raw_size(),
-                "wrong output bytes inferred.");
-    return res == 0;
-  };
+  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
+  bool ExecuteGraphNew(const vector<Tensor>& input_tensors,
+                       vector<Tensor> *output_tensors);
+  bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);

   bool TeardownGraph();
   void PrintLog();
@@ -70,11 +46,14 @@ class HexagonControlWrapper {
   int nn_id_;
   Serializer serializer_;
+  Quantizer quantizer_;

-  vector<index_t> input_shape_;
-  vector<index_t> output_shape_;
-  DataType input_data_type_;
-  DataType output_data_type_;
+  vector<vector<index_t>> input_shapes_;
+  vector<vector<index_t>> output_shapes_;
+  vector<DataType> input_data_types_;
+  vector<DataType> output_data_types_;
+  uint32_t num_inputs_;
+  uint32_t num_outputs_;

   DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
...
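A rough usage sketch of the new multi-tensor entry point (model file name and shapes are placeholders, not taken from this commit):

  HexagonControlWrapper wrapper;
  wrapper.Init();
  wrapper.Config();
  wrapper.SetupGraph("model.pb");  // hypothetical model file
  vector<Tensor> inputs(1), outputs(1);
  inputs[0].SetDtype(DT_FLOAT);
  inputs[0].Resize({1, 480, 480, 3});
  // ... fill inputs[0] via mutable_data<float>() ...
  bool ok = wrapper.ExecuteGraphNew(inputs, &outputs);
  wrapper.TeardownGraph();
  wrapper.Finalize();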
@@ -8,7 +8,7 @@

 using namespace mace;

-TEST(HexagonControlerWrapper, GetVersion) {
+TEST(HexagonControlerWrapper, InputFloat) {
   testing::internal::LogToStderr();
   HexagonControlWrapper wrapper;
   VLOG(0) << "version: " << wrapper.GetVersion();
@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) {
   wrapper.ResetPerfInfo();
   timeval tv1, tv2;
   gettimeofday(&tv1, NULL);
-  int round = 2;
+  int round = 10;
   for (int i = 0; i < round; ++i) {
     VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   }
@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) {
   }
   std::cout << std::endl;

+  VLOG(0) << wrapper.TeardownGraph();
+  wrapper.Finalize();
+}
+
+TEST(HexagonControlerWrapper, PreQuantize) {
+  testing::internal::LogToStderr();
+  HexagonControlWrapper wrapper;
+  VLOG(0) << "version: " << wrapper.GetVersion();
+  wrapper.Init();
+  wrapper.SetDebugLevel(0);
+  wrapper.Config();
+  VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb");
+  wrapper.PrintGraph();
+
+  Tensor input_tensor;
+  Tensor output_tensor;
+  input_tensor.Resize({1, 480, 480, 3});
+  float *input_data = input_tensor.mutable_data<float>();
+  for (int i = 0; i < input_tensor.size(); ++i) {
+    input_data[i] = i % 256;
+  }
+
+  wrapper.ResetPerfInfo();
+  timeval tv1, tv2;
+  gettimeofday(&tv1, NULL);
+  int round = 10;
+  for (int i = 0; i < round; ++i) {
+    VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor);
+  }
+  gettimeofday(&tv2, NULL);
+  VLOG(0) << "avg duration: "
+          << ((tv2.tv_sec - tv1.tv_sec) * 1000 +
+              (tv2.tv_usec - tv1.tv_usec) / 1000) /
+             round;
+
+  wrapper.GetPerfInfo();
+  wrapper.PrintLog();
+
+  const float *output_data = output_tensor.data<float>();
+  for (int i = 0; i < output_tensor.size(); ++i) {
+    std::cout << output_data[i] << " ";
+  }
+  std::cout << std::endl;
+
   VLOG(0) << wrapper.TeardownGraph();
   wrapper.Finalize();
 }
\ No newline at end of file
@@ -5,6 +5,7 @@
 #include "mace/dsp/hexagon_control_wrapper.h"
 #include "gtest/gtest.h"

+#define RESIZE_BILINEAR_TEST_CHANNELS 128
 using namespace mace;

 static NetDef BuildNetDef() {
@@ -17,7 +18,7 @@ static NetDef BuildNetDef() {
   input_op->set_type("INPUT");
   input_op->set_node_id(0);
   input_op->set_padding(0);
-  input_op->add_out_max_byte_size(1000);
+  input_op->add_out_max_byte_size(1200);

   // relu op
   OperatorDef *resize_bilinear_op = net.add_op();
@@ -45,7 +46,7 @@ static NetDef BuildNetDef() {
   input_node_input = resize_bilinear_op->add_node_input();
   input_node_input->set_node_id(12);
   input_node_input->set_output_port(0);
-  resize_bilinear_op->add_out_max_byte_size(1000);
+  resize_bilinear_op->add_out_max_byte_size(1200);
   resize_bilinear_op->add_out_max_byte_size(1000);
   resize_bilinear_op->add_out_max_byte_size(1000);
@@ -64,8 +65,8 @@ static NetDef BuildNetDef() {
   new_dim_tensor->add_dims(2);
   new_dim_tensor->set_data_type(DataType::DT_INT32);
   new_dim_tensor->set_node_id(10);
-  new_dim_tensor->add_int32_data(1);
-  new_dim_tensor->add_int32_data(1);
+  new_dim_tensor->add_int32_data(2);
+  new_dim_tensor->add_int32_data(2);

   TensorProto *input_min_tensor = net.add_tensors();
   input_min_tensor->set_name("input_min");
@@ -86,20 +87,20 @@ static NetDef BuildNetDef() {
   input_info->set_name("input_node");
   input_info->set_node_id(0);
   input_info->add_dims(1);
-  input_info->add_dims(2);
-  input_info->add_dims(2);
-  input_info->add_dims(128);
+  input_info->add_dims(3);
+  input_info->add_dims(3);
+  input_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
   input_info->set_data_type(DataType::DT_UINT8);
-  input_info->set_max_byte_size(1000);
+  input_info->set_max_byte_size(1200);

   OutputInfo *output_info = net.add_output_info();
   output_info->set_name("output_node");
   output_info->set_node_id(1);
   output_info->add_dims(1);
-  output_info->add_dims(1);
-  output_info->add_dims(1);
-  output_info->add_dims(128);
+  output_info->add_dims(2);
+  output_info->add_dims(2);
+  output_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
   output_info->set_data_type(DataType::DT_UINT8);
-  output_info->set_max_byte_size(1000);
+  output_info->set_max_byte_size(1200);

   return net;
 }
@@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) {
   Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU);
   Tensor input_tensor(cpu_allocator, DT_UINT8);
   Tensor output_tensor(cpu_allocator, DT_UINT8);
-  input_tensor.Resize({1, 2, 2, 128});
-  output_tensor.Resize({1, 1, 1, 128});
+  input_tensor.Resize({1, 3, 3, RESIZE_BILINEAR_TEST_CHANNELS});
+  output_tensor.Resize({1, 2, 2, RESIZE_BILINEAR_TEST_CHANNELS});
   uint8_t *input_data = input_tensor.mutable_data<uint8_t>();
   const uint8_t *output_data = output_tensor.data<uint8_t>();
-  for (int c = 0; c < 128; ++c) {
-    input_data[c] = input_data[c + 128] = input_data[c + 256]
-        = input_data[c + 384] = (uint8_t)c;
+  for (int wh = 0; wh < 9; ++wh) {
+    for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) {
+      input_data[wh * RESIZE_BILINEAR_TEST_CHANNELS + c] = 9 - wh;
+    }
   }

   VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   wrapper.PrintLog();

-  for (int i = 0; i < output_tensor.size(); ++i) {
-    EXPECT_EQ(i, output_data[i]);
+  vector<uint8_t> expected {9, 8, 5, 3};
+  for (int i = 0; i < 4; ++i) {
+    for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c)
+      EXPECT_EQ(expected[i],
+                output_data[i * RESIZE_BILINEAR_TEST_CHANNELS + c]);
   }
   std::cout << std::endl;
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

/**
 * Usage:
 * mace_dsp_run --model=mobi_mace.pb \
 *              --input_shape=1,3,224,224 \
 *              --input_file=input_data \
 *              --output_file=mace.out
 */
#include <sys/time.h>
#include <fstream>
#include "mace/dsp/hexagon_control_wrapper.h"
#include "mace/core/net.h"
#include "mace/utils/command_line_flags.h"

using namespace std;
using namespace mace;

void ParseShape(const string &str, vector<index_t> *shape) {
  string tmp = str;
  while (!tmp.empty()) {
    int dim = atoi(tmp.data());
    shape->push_back(dim);
    size_t next_offset = tmp.find(",");
    if (next_offset == string::npos) {
      break;
    } else {
      tmp = tmp.substr(next_offset + 1);
    }
  }
}

int main(int argc, char **argv) {
  string model_file;
  string input_shape;
  string input_file;
  string output_file;
  int round = 1;

  std::vector<Flag> flag_list = {
      Flag("model", &model_file, "model file name"),
      Flag("input_shape", &input_shape, "input shape, separated by comma"),
      Flag("input_file", &input_file, "input file name"),
      Flag("output_file", &output_file, "output file name"),
      Flag("round", &round, "round"),
  };

  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }

  VLOG(0) << "model: " << model_file << std::endl
          << "input_shape: " << input_shape << std::endl
          << "input_file: " << input_file << std::endl
          << "output_file: " << output_file << std::endl
          << "round: " << round << std::endl;

  vector<index_t> shape;
  ParseShape(input_shape, &shape);

  // load input
  Tensor input_tensor;
  input_tensor.Resize(shape);
  float *input_data = input_tensor.mutable_data<float>();
  ifstream in_file(input_file, ios::in | ios::binary);
  in_file.read(reinterpret_cast<char *>(input_data),
               input_tensor.size() * sizeof(float));
  in_file.close();

  // execute
  HexagonControlWrapper wrapper;
  VLOG(0) << "version: " << wrapper.GetVersion();
  wrapper.Init();
  wrapper.SetDebugLevel(0);
  wrapper.Config();
  VLOG(0) << wrapper.SetupGraph(model_file);
  wrapper.PrintGraph();

  Tensor output_tensor;
  timeval tv1, tv2;
  gettimeofday(&tv1, NULL);
  for (int i = 0; i < round; ++i) {
    VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
  }
  gettimeofday(&tv2, NULL);
  cout << "avg duration: "
       << ((tv2.tv_sec - tv1.tv_sec) * 1000 +
           (tv2.tv_usec - tv1.tv_usec) / 1000) /
          round
       << endl;

  wrapper.GetPerfInfo();
  wrapper.PrintLog();
  VLOG(0) << wrapper.TeardownGraph();
  wrapper.Finalize();

  // save output
  ofstream out_file(output_file, ios::binary);
  out_file.write((const char *) (output_tensor.data<float>()),
                 output_tensor.size() * sizeof(float));
  out_file.flush();
  out_file.close();
}
\ No newline at end of file
@@ -20,7 +20,7 @@ cc_library(
     hdrs = glob([
         "*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/core:core",
     ],
...
@@ -10,15 +10,23 @@
 namespace mace {
 namespace kernels {

-template<DeviceType D, typename T>
-struct AddNFunctor {
-  void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+struct AddNFunctorBase {};
+
+template <DeviceType D, typename T>
+struct AddNFunctor : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor) {
+    output_tensor->ResizeLike(input_tensors[0]);
     Tensor::MappingGuard output_map(output_tensor);
     index_t size = input_tensors[0]->size();
     T *output_ptr = output_tensor->mutable_data<T>();
     memset(output_ptr, 0, size * sizeof(T));
     int n = input_tensors.size();
     for (int i = 0; i < n; ++i) {
+      MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
+      MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
+      MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
+      MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
       Tensor::MappingGuard input_map(input_tensors[i]);
       const T *input_ptr = input_tensors[i]->data<T>();
       for (index_t j = 0; j < size; ++j) {
@@ -28,15 +36,17 @@ struct AddNFunctor {
   }
 };

-template<>
+template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);

-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(
-    std::vector<const Tensor *> &inputs, Tensor *output);
+template <typename T>
+struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor);
+};

 } // namespace kernels
 } // namespace mace

 #endif // MACE_KERNELS_ADDN_H_
\ No newline at end of file
@@ -28,9 +28,10 @@ struct BatchNormFunctor {
     // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
-    const index_t n = input->dim(0);
-    const index_t channel = input->dim(1);
-    const index_t sample_size = input->dim(2) * input->dim(3);
+    const index_t batch = input->dim(0);
+    const index_t height = input->dim(1);
+    const index_t width = input->dim(2);
+    const index_t channels = input->dim(3);

     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard scale_mapper(scale);
@@ -48,19 +49,26 @@ struct BatchNormFunctor {
     const T *epsilon_ptr = epsilon->data<T>();
     T *output_ptr = output->mutable_data<T>();

+    vector<T> new_scale(channels);
+    vector<T> new_offset(channels);
+
 #pragma omp parallel for
-    for (index_t c = 0; c < channel; ++c) {
-      T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
-      T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale;
-      index_t pos = c * sample_size;
+    for (index_t c = 0; c < channels; ++c) {
+      new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
+      new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
+    }

-      for (index_t i = 0; i < n; ++i) {
-        const T *input_sample_ptr = input_ptr + pos;
-        T *output_sample_ptr = output_ptr + pos;
-        for (index_t j = 0; j < sample_size; ++j) {
-          output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
+    index_t pos = 0;
+
+#pragma omp parallel for
+    for (index_t n = 0; n < batch; ++n) {
+      for (index_t h = 0; h < height; ++h) {
+        for (index_t w = 0; w < width; ++w) {
+          for (index_t c = 0; c < channels; ++c) {
+            output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
+            ++pos;
+          }
         }
-        pos += channel * sample_size;
       }
     }
   }
@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *epsilon,
     Tensor *output);

-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *scale,
-    const Tensor *offset,
-    const Tensor *mean,
-    const Tensor *var,
-    const Tensor *epsilon,
-    Tensor *output);
+template <typename T>
+struct BatchNormFunctor<DeviceType::OPENCL, T> {
+  void operator()(const Tensor *input,
+                  const Tensor *scale,
+                  const Tensor *offset,
+                  const Tensor *mean,
+                  const Tensor *var,
+                  const Tensor *epsilon,
+                  Tensor *output);
+};

 } // namepsace kernels
 } // namespace mace
...
@@ -11,13 +11,23 @@
 namespace mace {
 namespace kernels {

+struct Conv2dFunctorBase {
+  Conv2dFunctorBase(const int *strides,
+                    const Padding &paddings,
+                    const int *dilations)
+      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+
+  const int *strides_;    // [stride_h, stride_w]
+  const int *dilations_;  // [dilation_h, dilation_w]
+  Padding paddings_;
+};
+
 template<DeviceType D, typename T>
-struct Conv2dFunctor {
-  Conv2dFunctor() {}
+struct Conv2dFunctor : Conv2dFunctorBase {
   Conv2dFunctor(const int *strides,
                 const Padding &paddings,
                 const int *dilations)
-      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+      : Conv2dFunctorBase(strides, paddings, dilations) {}

   void operator()(const Tensor *input,
                   const Tensor *filter,
@@ -76,9 +86,10 @@ struct Conv2dFunctor {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
-              T bias_channel = bias_data ? bias_data[c] : 0;
+              T bias_channel = 0.0f;
+              if (bias) bias_channel = bias_data[c];
               *output_data = bias_channel;
-              T sum = 0;
+              T sum = 0.0f;
               const T *filter_ptr = filter_data + c;
               for (int kh = 0; kh < kernel_h; ++kh) {
                 for (int kw = 0; kw < kernel_w; ++kw) {
@@ -113,9 +124,6 @@ struct Conv2dFunctor {
   }

-  const int *strides_;    // [stride_h, stride_w]
-  const int *dilations_;  // [dilation_h, dilation_w]
-  Padding paddings_;
 };

 template<>
@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                                                         const Tensor *filter,
                                                         const Tensor *bias,
                                                         Tensor *output);
-template<>
-void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
-                                                          const Tensor *filter,
-                                                          const Tensor *bias,
-                                                          Tensor *output);
+
+template<typename T>
+struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
+  Conv2dFunctor(const int *strides,
+                const Padding &paddings,
+                const int *dilations)
+      : Conv2dFunctorBase(strides, paddings, dilations) {}
+
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output);
+};

 } // namespace kernels
 } // namespace mace
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#ifndef MACE_KERNELS_FUSED_CONV_2D_H_
#define MACE_KERNELS_FUSED_CONV_2D_H_

#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/conv_2d.h"

namespace mace {
namespace kernels {

struct FusedConv2dFunctorBase {
  FusedConv2dFunctorBase(const int *strides,
                         const Padding &paddings,
                         const int *dilations)
      : strides_(strides), dilations_(dilations), paddings_(paddings) {}

  const int *strides_;    // [stride_h, stride_w]
  const int *dilations_;  // [dilation_h, dilation_w]
  Padding paddings_;
};

template<DeviceType D, typename T>
struct FusedConv2dFunctor : FusedConv2dFunctorBase {
  FusedConv2dFunctor(const int *strides,
                     const Padding &paddings,
                     const int *dilations)
      : FusedConv2dFunctorBase(strides, paddings, dilations) {}

  void operator()(const Tensor *input,
                  const Tensor *filter,
                  const Tensor *bias,
                  Tensor *output) {
    Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
    T *output_data = output->mutable_data<T>();
    T zero_value;
    if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
      zero_value = half_float::half_cast<half>(0.0f);
    } else {
      zero_value = 0;
    }

    auto output_size = output->size();
    for (int n = 0; n < output_size; ++n) {
      *output_data = *output_data < 0 ? zero_value : *output_data;
      output_data++;
    }
  }
};

template<typename T>
struct FusedConv2dFunctor<DeviceType::OPENCL, T> : FusedConv2dFunctorBase {
  FusedConv2dFunctor(const int *strides,
                     const Padding &paddings,
                     const int *dilations)
      : FusedConv2dFunctorBase(strides, paddings, dilations) {}

  void operator()(const Tensor *input,
                  const Tensor *filter,
                  const Tensor *bias,
                  Tensor *output);
};

} // namespace kernels
} // namespace mace

#endif // MACE_KERNELS_FUSED_CONV_2D_H_
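A hedged call-site sketch for the generic path above (stride/padding/dilation values are illustrative, and Padding::SAME is assumed to be a member of the Padding enum):

  int strides[2] = {1, 1};
  int dilations[2] = {1, 1};
  kernels::FusedConv2dFunctor<DeviceType::CPU, float> fused_conv(
      strides, Padding::SAME, dilations);
  fused_conv(input, filter, bias, output);  // conv2d, then negatives clamped to zero (ReLU)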
@@ -10,7 +10,7 @@ namespace kernels {

 template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
   // TODO: neon mem copy
   index_t size = output_tensor->size();
   float *output_ptr = output_tensor->mutable_data<float>();
@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
 };

 } // namespace kernels
 } // namespace mace
\ No newline at end of file
@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *input_tensor,
     Tensor *output_tensor) {

+  std::vector<index_t> output_shape(4);
+  std::vector<int> paddings(2);
+  std::vector<index_t> filter_shape(4);
+  filter_shape[0] = input_tensor->shape()[1];
+  filter_shape[1] = input_tensor->shape()[1];
+  filter_shape[2] = kernels_[0];
+  filter_shape[3] = kernels_[1];
+  kernels::CalcPaddingAndOutputSize(
+      input_tensor->shape().data(), filter_shape.data(), this->dilations_,
+      strides_, this->padding_, output_shape.data(),
+      paddings.data());
+  output_tensor->Resize(output_shape);
+
   const float *input = input_tensor->data<float>();
   float *output = output_tensor->mutable_data<float>();
   const index_t *input_shape = input_tensor->shape().data();
-  const index_t *output_shape = output_tensor->shape().data();
-
-  int paddings[2];
-  std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
-                                       kernels_[0], kernels_[1]};
-  kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
-                          strides_, this->padding_, paddings);
 #ifdef __COPY_MAKE_PADDING
   Tensor padded_input;
-  ConstructInputWithPadding(input_tensor, paddings, &padded_input);
+  ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input);
   input = padded_input.data<float>();
   input_shape = padded_input.shape().data();
 #endif
@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
     // kernel_size: 2x2, strides: 2x2
     if (pooling_type_ == MAX) {  // MAX_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     } else {  // AVG_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     }
   } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
     // kernel_size: 3x3, strides: 2x2
     if (pooling_type_ == MAX) {  // MAX_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     } else {  // AVG_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
     }
   } else {  // not implement yet
...
@@ -5,52 +5,83 @@
 #include "mace/kernels/addn.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {

-static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
-  index_t element_size = input0->NumElements();
-  index_t blocks = (element_size + 3) / 4;
-
-  const uint32_t gws = blocks;
+template <typename T>
+static void AddN(const std::vector<const Tensor *> &input_tensors,
+                 Tensor *output) {
+  if (input_tensors.size() > 4) {
+    MACE_NOT_IMPLEMENTED;
+  }
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channels = output->dim(3);
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const index_t width_pixels = channel_blocks * width;
+  const index_t batch_height_pixels = batch * height;

   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype()));
-  auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
+  auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);

   const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);

   uint32_t idx = 0;
-  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer())));
-  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer())));
-  addn_kernel.setArg(idx++, static_cast<int32_t>(element_size));
-  addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  for (auto input : input_tensors) {
+    addn_kernel.setArg(idx++,
+                       *(static_cast<const cl::Image2D *>(input->buffer())));
+  }
+  addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       addn_kernel, cl::NullRange,
-      cl::NDRange(gws),
-      cl::NDRange(lws),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+      cl::NDRange(width_pixels, batch_height_pixels),
+      cl::NDRange(64, 16),  // TODO fix this
+      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
+  MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
 }

-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors,
-                                                        Tensor *output_tensor) {
-  if (input_tensors.empty() || input_tensors.front() == nullptr) {
-    return;
-  }
+template <typename T>
+void AddNFunctor<DeviceType::OPENCL, T>::operator()(
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
   size_t size = input_tensors.size();
+  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
+
+  const index_t batch = input_tensors[0]->dim(0);
+  const index_t height = input_tensors[0]->dim(1);
+  const index_t width = input_tensors[0]->dim(2);
+  const index_t channels = input_tensors[0]->dim(3);

-  switch (size) {
-    case 2:Add2(input_tensors[0], input_tensors[1], output_tensor);
-      break;
-    default:MACE_NOT_IMPLEMENTED;
+  for (int i = 1; i < size; ++i) {
+    MACE_CHECK_NOTNULL(input_tensors[i]);
+    MACE_CHECK(batch == input_tensors[i]->dim(0));
+    MACE_CHECK(height == input_tensors[i]->dim(1));
+    MACE_CHECK(width == input_tensors[i]->dim(2));
+    MACE_CHECK(channels == input_tensors[i]->dim(3));
   }
+
+  std::vector<index_t> output_shape = input_tensors[0]->shape();
+  std::vector<size_t> output_image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+  output_tensor->ResizeImage(output_shape, output_image_shape);
+
+  AddN<T>(input_tensors, output_tensor);
 };

+template
+struct AddNFunctor<DeviceType::OPENCL, float>;
+
+template
+struct AddNFunctor<DeviceType::OPENCL, half>;
+
 } // namespace kernels
 } // namespace mace
@@ -11,8 +11,8 @@
 namespace mace {
 namespace kernels {

-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
+template <typename T>
+void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input,
     const Tensor *scale,
     const Tensor *offset,
@@ -21,35 +21,39 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
     const Tensor *epsilon,
     Tensor *output) {

-  index_t pixel_size = input->dim(2) * input->dim(3);
-  index_t blocks = (pixel_size + 3) / 4;
+  const index_t batch = input->dim(0);
+  const index_t height = input->dim(1);
+  const index_t width = input->dim(2);
+  const index_t channels = input->dim(3);

-  const uint32_t gws[3] = {static_cast<uint32_t>(input->dim(0)),
-                           static_cast<uint32_t>(input->dim(1)),
-                           static_cast<uint32_t>(blocks)};
+  const index_t channel_blocks = RoundUpDiv4(channels);
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};

   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);

   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  const std::vector<uint32_t> lws = {1, 1, kwg_size};
+  const std::vector<uint32_t> lws = {1, kwg_size, 1};

   uint32_t idx = 0;
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(scale->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(offset->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(mean->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(var->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(scale->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(offset->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(mean->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(var->buffer())));
   bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(epsilon->buffer())));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(pixel_size));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-  bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
-  bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr);
+  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

   auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
-    return {{1, 1, 64},
+    return {{8, 128, 1}, //SNPE size
+            {1, 1, 64},
             {1, 1, 128},
             {1, kwg_size/16, 16},
             {1, kwg_size/32, 32},
@@ -80,5 +84,9 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
                  func);
 }

+template
+struct BatchNormFunctor<DeviceType::OPENCL, float>;
+
+template
+struct BatchNormFunctor<DeviceType::OPENCL, half>;
+
 } // namespace kernels
 } // namespace mace
@@ -24,8 +24,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   }

   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype()));
+  if (buffer->dtype() == image->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
+  } else {
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+  }
   auto runtime = OpenCLRuntime::Get();
   string kernel_name;
   switch (type) {
...
 #include <common.h>

-// Supported data type: half/float
-__kernel void add2(__global const DATA_TYPE *input0,
-                   __global const DATA_TYPE *input1,
-                   __private const int size,
-                   __global DATA_TYPE *output) {
-  int idx = get_global_id(0);
+__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+                   __read_only image2d_t input1,
+#if INPUT_NUM > 2
+                   __read_only image2d_t input2,
+#endif
+#if INPUT_NUM > 3
+                   __read_only image2d_t input3,
+#endif
+                   __write_only image2d_t output) {
+  const int w = get_global_id(0);
+  const int hb = get_global_id(1);

-  if (idx + 4 > size) {
-    for(; idx < size; ++idx) {
-      *(output+idx) = *(input0+idx) + *(input1+idx);
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
-    VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
-    vstore4(in_data0+in_data1, idx, output);
-  }
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
+  DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
+  DATA_TYPE4 out = in0 + in1;
+
+#if INPUT_NUM > 2
+  DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
+  out = out + in2;
+#endif
+
+#if INPUT_NUM > 3
+  DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
+  out = out + in3;
+#endif
+
+  WRITE_IMAGET(output, (int2)(w, hb), out);
 }
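The /* [c%4 * w * c/4, h * b] */ comment describes the image layout these kernels share: four consecutive channels are packed into one image pixel. Reading the comment, an NHWC element (n, h, w, c) maps to image column (c / 4) * width + w and image row n * height + h, with c % 4 selecting the component; this is an interpretation of the layout comment, not code from the commit.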
 #include <common.h>

 // Supported data types: half/float
-void kernel batch_norm(global const DATA_TYPE *input,
-                       global const DATA_TYPE *scale,
-                       global const DATA_TYPE *offset,
-                       global const DATA_TYPE *mean,
-                       global const DATA_TYPE *var,
-                       global const DATA_TYPE *epsilon,
-                       private const int pixels,
-                       global DATA_TYPE *output,
-                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale,
-                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) {
-  const int batch = get_global_id(0);
-  const int channel = get_global_id(1);
-  const int channels = get_global_size(1);
-  const int pixel_offset = get_global_id(2);
-  const int local_channel = get_local_id(1);
-  const int local_pixel_idx = get_local_id(2);
+__kernel void batch_norm(__read_only image2d_t input,
+                         __read_only image2d_t scale,
+                         __read_only image2d_t offset,
+                         __read_only image2d_t mean,
+                         __read_only image2d_t var,
+                         __global const DATA_TYPE *epsilon,
+                         __write_only image2d_t output) {
+  const int ch_blk = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);

-  if(local_pixel_idx == 0) {
-    new_scale[local_channel] = (float4)(scale[channel] * rsqrt(var[channel] + *epsilon));
-    new_offset[local_channel] = (float4)(offset[channel] - mean[channel] * new_scale[local_channel].x);
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
+  DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
+  DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0));
+  DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
+  DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));

-  const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4;
-  const DATA_TYPE *input_ptr = input + image_offset;
-  DATA_TYPE *output_ptr = output + image_offset;
-  const int end = (batch * channels + channel + 1) * pixels;
-  if ((image_offset+4) > end) {
-    for (int i = image_offset; i < end; ++i) {
-      *output_ptr = new_scale[local_channel].x * *input_ptr + new_offset[local_channel].x;
-      ++input_ptr;
-      ++output_ptr;
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr);
-    values = values * new_scale[local_channel] + new_offset[local_channel];
-    vstore4(values, 0, output_ptr);
-  }
-}
+  DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon));
+  DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;
+
+  const int pos = ch_blk * width + w;
+
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
+  DATA_TYPE4 out = in * new_scale + new_offset;
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
+}
...@@ -14,4 +14,11 @@
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
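For reference, a sketch of how these token-pasting macros expand under a half-precision build (assuming -DDATA_TYPE=half -DCMD_DATA_TYPE=h; a float build substitutes float/f):

// DATA_TYPE4   -> VEC_DATA_TYPE(half, 4)   -> half4
// READ_IMAGET  -> CMD_TYPE(read_image, h)  -> read_imageh
// WRITE_IMAGET -> CMD_TYPE(write_image, h) -> write_imageh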
#include <common.h>
__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
__private const int out_height,
__private const int out_width,
__private const int filter_height,
__private const int filter_width,
__private const int padding_top,
__private const int padding_left) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
int in_width3 = in_width2 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif
const int batch_idx = (out_hb / out_height) * in_height;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, in_width_idx;
  // Unrolling this loop hurts performance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
READ_INPUT(0);
READ_INPUT(1);
READ_INPUT(2);
READ_INPUT(3);
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
        // Would prefetching into L2 improve performance? How to prefetch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
}
}
}
#ifdef FUSED_RELU
  // TODO: support relux
out0 = fmax(out0, 0);
out1 = fmax(out1, 0);
out2 = fmax(out2, 0);
out3 = fmax(out3, 0);
#endif
const int out_x_base = out_ch_blk * out_width;
int w = out_w_blk;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out0);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out1);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out2);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out3);
}
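A worked example of the filter indexing above (annotation only): with in_ch_blks = 2, so rounded_in_ch = 8, and a 3x3 filter, the tap at hb_idx = 1, width_idx = 2 of input channel block 1 gives:

// filter_idx = (1 << 2) + (1 * 3 + 2) * 8 = 4 + 40 = 44
// weights0..weights3 are then read from columns 44..47 of row out_ch_blk,
// matching the filter layout [cout%4 * cin * kw * kh, cout/4].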
#include <common.h>
#define vec_conv_2d_1x1_s1 \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
#define vec_conv_2d_1x1_s2 \
VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel); \
VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \
VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
#define vec_conv_2d_1x1_compute_loop \
for (int oc = 0; oc < 4; ++oc) { \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr + oc * out_pixel); \
}
#define vec_conv_2d_1x1_compute \
VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \
out += in0 * weights.x; \
out += in1 * weights.y; \
out += in2 * weights.z; \
out += in3 * weights.w; \
vstore4(out, 0, output_ptr);
// Supported data type: half/float
__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
__global const DATA_TYPE *filter, /* o, i, kh, kw */
#ifdef BIAS
__global const DATA_TYPE *bias, /* o */
#endif /* defined(BIAS) */
__global DATA_TYPE *output, /* n, c, h, w */
__private const int in_chan_num,
__private const int out_chan_num,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
const int stride = 1;
#else
const int stride = 2;
#endif
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
int out_chan_len = out_chan_end - out_chan_begin;
int pixel_len = out_pixel_end - out_pixel_begin;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef BIAS
DATA_TYPE bias_value = bias[out_chan];
#else
DATA_TYPE bias_value = 0;
#endif
for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = bias_value;
}
}
int in_chan = 0;
if (pixel_len == 4) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
int out_chan = out_chan_begin;
for (; out_chan + 3 < out_chan_end; out_chan += 4) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute_loop;
}
for (; out_chan < out_chan_end; ++out_chan) {
const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute;
}
}
}
for (; in_chan < in_chan_num; ++in_chan) {
const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
for (int p = 0; p < pixel_len; ++p) {
float in = input_ptr[p*stride];
output_ptr[p] += in * weights;
}
}
}
}
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int height,
                          __private const int width) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
...@@ -154,151 +18,103 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#ifdef BIAS
  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
#endif

  int4 w;
#if STRIDE == 1
  w.x = out_w_blk;
  w.y = w.x + out_w_blks;
  w.z = w.y + out_w_blks;
  w.w = w.z + out_w_blks;
  int out_hb_idx = (out_hb % height);
#else
  w.x = out_w_blk * 2;
  w.y = (out_w_blk + out_w_blks) * 2;
  w.z = (out_w_blk + 2 * out_w_blks) * 2;
  w.w = (out_w_blk + 3 * out_w_blks) * 2;
  int out_hb_idx = (out_hb % height) * 2;
#endif

  w.x = select(w.x, INT_MIN, w.x >= in_width);
  w.y = select(w.y, INT_MIN, w.y >= in_width);
  w.z = select(w.z, INT_MIN, w.z >= in_width);
  w.w = select(w.w, INT_MIN, w.w >= in_width);

  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
                      -1,
                      out_hb_idx >= in_height);

  // Unrolling this loop hurts performance
  int in_x_base = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));

    const int filter_x0 = in_ch_blk << 2;
    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));

    // Would prefetching into L2 improve performance? How to prefetch image data?
    out0 += in0.x * weights0;
    out0 += in0.y * weights1;
    out0 += in0.z * weights2;
    out0 += in0.w * weights3;

    out1 += in1.x * weights0;
    out1 += in1.y * weights1;
    out1 += in1.z * weights2;
    out1 += in1.w * weights3;

    out2 += in2.x * weights0;
    out2 += in2.y * weights1;
    out2 += in2.z * weights2;
    out2 += in2.w * weights3;

    out3 += in3.x * weights0;
    out3 += in3.y * weights1;
    out3 += in3.z * weights2;
    out3 += in3.w * weights3;

    in_x_base += in_width;
  }

#ifdef FUSED_RELU
  // TODO: support relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
#endif

  const int out_x_base = out_ch_blk * width;
  int out_x_idx = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);

  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
}
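Note on the work distribution above: each work-item produces four output columns strided by out_w_blks. A small example, assuming STRIDE == 1 and width = in_width = 10:

// out_w_blks = ceil(10 / 4) = 3 work-items per output row; the item with
// out_w_blk = 1 computes columns 1, 4, 7 and returns before column 10,
// whose read coordinate was already clamped via select(..., INT_MIN, ...).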
...@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
                          __private const int out_height,
                          __private const int out_width,
                          __private const int padding_top,
...@@ -17,120 +17,145 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);
  const int rounded_in_ch = in_ch_blks * 4;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
  DATA_TYPE4 out0 =
      READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
  DATA_TYPE4 out4 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
  DATA_TYPE4 out4 = 0;
#endif

#if STRIDE == 1
  int in_width0 = out_w_blk - padding_left;
  int in_width1 = in_width0 + out_w_blks;
  int in_width2 = in_width1 + out_w_blks;
  int in_width3 = in_width2 + out_w_blks;
  int in_width4 = in_width3 + out_w_blks;
  const int height_idx = (out_hb % out_height) - padding_top;
#else
  int in_width0 = out_w_blk * 2 - padding_left;
  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
  const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif

  const int batch_idx = (out_hb / out_height) * in_height;

  DATA_TYPE4 in0, in1, in2, in3, in4;
  DATA_TYPE4 weights0, weights1, weights2, weights3;
  int in_idx, in_width_idx;
  // Unrolling this loop hurts performance
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
      int in_hb_value = height_idx + hb_idx;
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));
      for (short width_idx = 0; width_idx < 3; ++width_idx) {
        in_idx = in_ch_blk * in_width;
        int in_width_value;
#define READ_INPUT(i) \
        in_width_value = in_width##i + width_idx; \
        in_width_value = select(in_idx + in_width_value, \
                                -1, \
                                (in_width_value < 0 || in_width_value >= in_width)); \
        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));

        READ_INPUT(0);
        READ_INPUT(1);
        READ_INPUT(2);
        READ_INPUT(3);
        READ_INPUT(4);

#undef READ_INPUT

        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));

        // Would prefetching into L2 improve performance? How to prefetch image data?
        // Interleaving load and mul does not improve performance as expected
        out0 += in0.x * weights0;
        out0 += in0.y * weights1;
        out0 += in0.z * weights2;
        out0 += in0.w * weights3;

        out1 += in1.x * weights0;
        out1 += in1.y * weights1;
        out1 += in1.z * weights2;
        out1 += in1.w * weights3;

        out2 += in2.x * weights0;
        out2 += in2.y * weights1;
        out2 += in2.z * weights2;
        out2 += in2.w * weights3;

        out3 += in3.x * weights0;
        out3 += in3.y * weights1;
        out3 += in3.z * weights2;
        out3 += in3.w * weights3;

        out4 += in4.x * weights0;
        out4 += in4.y * weights1;
        out4 += in4.z * weights2;
        out4 += in4.w * weights3;
      }
    }
  }

#ifdef FUSED_RELU
  // TODO: support relux
  out0 = fmax(out0, 0);
  out1 = fmax(out1, 0);
  out2 = fmax(out2, 0);
  out3 = fmax(out3, 0);
  out4 = fmax(out4, 0);
#endif

  const int out_x_base = out_ch_blk * out_width;
  int w = out_w_blk;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out0);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out1);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out2);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out3);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),
               out4);
}
#include <common.h>
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
  VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
DATA_TYPE row01 = *(input_ptr + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
DATA_TYPE row11 = *(input_ptr + in_width + 8);
VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res;
}
DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
return res;
}
// Supported data type: half/float
__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height,
__private const int in_width,
__private const int out_chan_num,
__private const int out_height,
__private const int out_width,
__private const int stride,
__global DATA_TYPE *output) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
const DATA_TYPE *input_ptr = input_base + i * in_pixel;
DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) {
VEC_DATA_TYPE(DATA_TYPE,4) res;
#ifdef STRIDE_1
res = vec_pooling_3_s1(input_ptr, in_width);
#else
      res = vec_pooling_3_s2(input_ptr, in_width);
#endif
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = inner_pooling_3(input_ptr, in_width);
input_ptr += stride;
}
}
}
}
#ifdef FP16
#define MIN_VALUE -USHRT_MAX
#else
#define MIN_VALUE -FLT_MAX
#endif

inline int calculate_avg_block_size(const int pool_size,
                                    const int pos_h,
                                    const int pos_w,
                                    const int h_size,
                                    const int w_size) {
  const int h_start = max(0, pos_h);
  const int w_start = max(0, pos_w);
  const int h_end = min(pos_h + pool_size, h_size);
  const int w_end = min(pos_w + pool_size, w_size);
  return (h_end - h_start) * (w_end - w_start);
}

// Supported data type: half/float
__kernel void pooling(__read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,
                      __private const int out_height,
                      __private const int pad_top,
                      __private const int pad_left,
                      __private const int stride,
                      __private const int pooling_size,
                      __write_only image2d_t output) {
  const int out_chan_idx = get_global_id(0);
  const int out_width_idx = get_global_id(1);
  const int out_width = get_global_size(1);
  const int out_hb_idx = get_global_id(2);

  const int batch_idx = (out_hb_idx / out_height) * in_height;
  const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
  const int in_width_start = out_width_idx * stride - pad_left;
  const int in_channel_offset = out_chan_idx * in_width;

#ifdef POOL_AVG
  DATA_TYPE4 res = 0;
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    for (int width = 0; width < pooling_size; ++width) {
      int in_width_idx = in_width_start + width;
      in_width_idx = select(in_channel_offset + in_width_idx,
                            -1,
                            (in_width_idx < 0 || in_width_idx >= in_width));

      DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
      res = res + in;
    }
  }
  const int block_size = calculate_avg_block_size(pooling_size,
                                                  in_height_start, in_width_start,
                                                  in_height, in_width);
  res /= block_size;
#else
  DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
  for (int height = 0; height < pooling_size; ++height) {
    int in_height_idx = in_height_start + height;
    in_height_idx = select(batch_idx + in_height_idx,
                           -1,
                           (in_height_idx < 0 || in_height_idx >= in_height));
    if (in_height_idx != -1) {
      for (int width = 0; width < pooling_size; ++width) {
        int in_width_idx = in_width_start + width;
        in_width_idx = select(in_channel_offset + in_width_idx,
                              -1,
                              (in_width_idx < 0 || in_width_idx >= in_width));
        if (in_width_idx != -1) {
          DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
          res = fmax(res, in);
        }
      }
    }
  }
#endif

  WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
}
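Worth noting for the average-pool path: calculate_avg_block_size divides by the number of taps that actually fall inside the input, so border windows are not diluted by padding. A quick check of the arithmetic:

// pooling_size = 3, in_height = in_width = 5, window starting at (-1, -1):
//   h_start = max(0, -1) = 0, h_end = min(-1 + 3, 5) = 2  ->  2 rows
//   w_start = 0, w_end = 2                                ->  2 cols
// block_size = 2 * 2 = 4, i.e. the corner output averages 4 values, not 9.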
#include <common.h>

__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
                                      __private const int in_width,
                                      __private const int out_height) {
  const int ch_blk = get_global_id(0);
  const int ch_blks = get_global_size(0);
  const int w = get_global_id(1);
  const int out_width = get_global_size(1);
  const int hb = get_global_id(2);
  const int b = hb / out_height;
  const int h = hb % out_height;

  const float h_in = h * height_scale;
  const float w_in = w * width_scale;
...@@ -24,16 +25,26 @@ __kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c,
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int in_w_offset = ch_blk * in_width;
  const int in_h_offset = b * in_height;

  DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
                                    (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
  DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
                                     (int2)(in_w_offset + w_upper, in_h_offset + h_lower));
  DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
                                       (int2)(in_w_offset + w_lower, in_h_offset + h_upper));
  DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
                                        (int2)(in_w_offset + w_upper, in_h_offset + h_upper));

  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
  DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;

  DATA_TYPE4 out = top + (bottom - top) * h_lerp;

  const int out_w_offset = ch_blk * out_width;
  const int out_h_offset = b * out_height;

  WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
}
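For clarity, the two-stage lerp above is the standard bilinear formula; expanding the code's expressions (an annotation, not part of the source):

// top    = (1 - w_lerp) * top_left    + w_lerp * top_right
// bottom = (1 - w_lerp) * bottom_left + w_lerp * bottom_right
// out    = (1 - h_lerp) * top         + h_lerp * bottom
// e.g. h_lerp = w_lerp = 0.5 yields the average of the four texels.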
...@@ -9,50 +9,56 @@ namespace mace {
namespace kernels {

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
                               const int *padding, const DataType dt,
                               Tensor *output);

extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
                         const DataType dt, Tensor *output);

template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                      const Tensor *filter,
                                                      const Tensor *bias,
                                                      Tensor *output) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, const bool fused_relu,
                                       const int *padding, const DataType dt,
                                       Tensor *output);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
      {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
      {nullptr, nullptr},
      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  index_t kernel_h = filter->dim(0);
  index_t kernel_w = filter->dim(1);
  if (!input->is_image() || strides_[0] != strides_[1] ||
      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    MACE_NOT_IMPLEMENTED;
  }

  std::vector<index_t> output_shape(4);
...@@ -61,17 +67,24 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
      input->shape().data(), filter->shape().data(), dilations_,
      strides_, paddings_, output_shape.data(), paddings.data());

  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  if (kernel_h == kernel_w && kernel_h <= 5 &&
      selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
    conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum<T>::value, output);
  } else {
    Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
  }
}
template
struct Conv2dFunctor<DeviceType::OPENCL, float>;
template
struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
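A note on the dispatch in this functor: selector is indexed by (kernel_size - 1, stride - 1), and nullptr entries fall through to the general Conv2dOpencl path. For example:

// selector[3 - 1][2 - 1] == Conv2dOpenclK3x3S2  // specialized 3x3 stride-2
// selector[5 - 1][1 - 1] == nullptr             // 5x5 falls back to Conv2dOpencl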
...@@ -5,83 +5,44 @@
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {
void Conv1x1(const Tensor *input,
             const Tensor *filter,
             const Tensor *bias,
             const bool fused_relu,
             const int stride,
             const DataType dt,
             Tensor *output) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);

  const index_t input_batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_channels = input->dim(3);

  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t width_blocks = RoundUpDiv4(width);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);

  MACE_CHECK(input_batch == batch);

  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  if (bias != nullptr) {
    built_options.emplace("-DBIAS");
  }
  if (fused_relu) {
    built_options.emplace("-DFUSED_RELU");
  }

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
...@@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input,
    conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
  }
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));

  auto command_queue = runtime->command_queue();
  cl_int error;
  error = command_queue.enqueueNDRangeKernel(
      conv_2d_kernel, cl::NullRange,
      cl::NDRange(static_cast<uint32_t>(channel_blocks),
                  static_cast<uint32_t>(width_blocks),
                  static_cast<uint32_t>(height * batch)),
      cl::NDRange(4, 15, 8),  // TODO auto tuning
      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS, error);
}

extern void Conv2dOpenclK1x1S1(const Tensor *input,
                               const Tensor *filter,
                               const Tensor *bias,
                               const bool fused_relu,
                               const int *padding,
                               const DataType dt,
                               Tensor *output) {
  Conv1x1(input, filter, bias, fused_relu, 1, dt, output);
};

extern void Conv2dOpenclK1x1S2(const Tensor *input,
                               const Tensor *filter,
                               const Tensor *bias,
                               const bool fused_relu,
                               const int *padding,
                               const DataType dt,
                               Tensor *output) {
  Conv1x1(input, filter, bias, fused_relu, 2, dt, output);
};

} // namespace kernels
......
...@@ -12,8 +12,9 @@ namespace mace {
namespace kernels {

static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
                         const DataType dt, Tensor *output) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
...@@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  if (fused_relu) {
    built_options.emplace("-DFUSED_RELU");
  }

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();

  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);

  uint32_t idx = 0;
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
...@@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
  conv_2d_kernel.setArg(idx++, padding[0] / 2);
...@@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
      conv_2d_kernel, cl::NullRange,
      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
                  static_cast<uint32_t>(height * batch)),
      cl::NDRange(16, 16, 4),
      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS, error);
}

void Conv2dOpenclK3x3S1(const Tensor *input,
                        const Tensor *filter,
                        const Tensor *bias,
                        const bool fused_relu,
                        const int *padding,
                        const DataType dt,
                        Tensor *output) {
  Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output);
};

void Conv2dOpenclK3x3S2(const Tensor *input,
                        const Tensor *filter,
                        const Tensor *bias,
                        const bool fused_relu,
                        const int *padding,
                        const DataType dt,
                        Tensor *output) {
  Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output);
};

} // namespace kernels
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(0)));
conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
} // namespace kernels
} // namespace mace
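
The generic conv_2d path bakes the stride and the optional fused ReLU into the program through the -DSTRIDE= and -DFUSED_RELU build options, so a single kernel source covers every supported stride. A minimal host-side sketch of calling it; the tensors and padding values here are placeholders, not values from this commit:

// Sketch only: `input`, `filter`, `output` are assumed to be image-backed
// Tensors prepared by the caller; `paddings` would come from
// CalcNHWCPaddingAndOutputSize as in the functors above.
const int paddings[2] = {0, 0};
Conv2dOpencl(input, filter, /*bias=*/nullptr, /*fused_relu=*/false,
             /*stride=*/1, paddings, DT_FLOAT, output);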
...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, ...@@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options); auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/fused_conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "OpenCL conv2d kernel with " LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter " << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet, falling back to the slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
FusedConv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
input, filter, bias, output);
return;
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
}
template
struct FusedConv2dFunctor<DeviceType::OPENCL, float>;
template
struct FusedConv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
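
The `selector` table keys specialized kernels by `[kernel_h - 1][strides_[0] - 1]`, so only 1x1 and 3x3 square filters with stride 1 or 2 currently hit the image-based fast path; everything else falls through to the CPU functor. The same dispatch idea in isolation, with illustrative names that are not part of the commit:

#include <cstdio>

typedef void (*KernelFn)();

void K1x1S1() { std::printf("1x1, stride 1\n"); }
void K3x3S2() { std::printf("3x3, stride 2\n"); }

// rows: kernel size 1..5, cols: stride 1..2; nullptr = unimplemented
static const KernelFn kSelector[5][2] = {{K1x1S1, nullptr},
                                         {nullptr, nullptr},
                                         {nullptr, K3x3S2},
                                         {nullptr, nullptr},
                                         {nullptr, nullptr}};

bool Dispatch(int kernel, int stride) {
  if (kernel < 1 || kernel > 5 || stride < 1 || stride > 2 ||
      kSelector[kernel - 1][stride - 1] == nullptr) {
    return false;  // caller would fall back to the slow path
  }
  kSelector[kernel - 1][stride - 1]();
  return true;
}

int main() {
  Dispatch(3, 2);  // prints "3x3, stride 2"
  Dispatch(5, 1);  // returns false: no 5x5 kernel registered
}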
...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -54,35 +54,19 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
std::string DataTypeToCLType(const DataType dt) { std::string DtToCLDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "float"; return "float";
case DT_HALF: case DT_HALF:
return "half"; return "half";
case DT_UINT8:
return "uchar";
case DT_INT8:
return "char";
case DT_DOUBLE:
return "double";
case DT_INT32:
return "int";
case DT_UINT32:
return "int";
case DT_UINT16:
return "ushort";
case DT_INT16:
return "short";
case DT_INT64:
return "long";
default: default:
LOG(FATAL) << "Unsupported data type"; LOG(FATAL) << "Unsupported data type";
return ""; return "";
} }
} }
std::string DataTypeToOPENCLCMDDataType(const DataType dt) { std::string DtToCLCMDDt(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "f"; return "f";
...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) { ...@@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
} }
} }
std::string DtToUpstreamCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpstreamCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
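
The new DtToUpstreamCL* helpers intentionally map both DT_FLOAT and DT_HALF to the float variants: kernels built with these options keep half storage but promote in-kernel arithmetic to float. A sketch mirroring the call sites above:

// `dt` comes from the op's "T" type constraint at the call sites above.
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DT_HALF));         // "-DDATA_TYPE=float"
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DT_HALF));  // "-DCMD_DATA_TYPE=f"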
...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape); std::vector<size_t> &image_shape);
std::string DataTypeToOPENCLCMDDataType(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
std::string DataTypeToCLType(const DataType dt); std::string DtToUpstreamCLCMDDt(const DataType dt);
std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -10,131 +10,94 @@ ...@@ -10,131 +10,94 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
static void Pooling3(const Tensor *input, static void Pooling(const Tensor *input,
const int *stride, const int *stride,
const PoolingType type, const int *paddings,
Tensor *output) { const int pooling_size,
if (type != MAX) { const PoolingType type,
MACE_NOT_IMPLEMENTED; const DataType dt,
} Tensor *output) {
index_t batch = output->dim(0); index_t batch = output->dim(0);
index_t channels = output->dim(1); index_t out_height = output->dim(1);
index_t out_height = output->dim(2); index_t out_width = output->dim(2);
index_t out_width = output->dim(3); index_t channels = output->dim(3);
index_t channel_blk = (channels + 3) / 4; index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_width = (out_width + 3) / 4 ;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(pixel_width * out_height), static_cast<uint32_t>(batch * out_height),
}; };
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
}
if (type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
const uint32_t lws[3] = {1, 8, 128}; uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t idx = 0; uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2))); pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height)); pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width)); pooling_kernel.setArg(idx++, paddings[0] / 2);
pooling_kernel.setArg(idx++, paddings[1] / 2);
pooling_kernel.setArg(idx++, stride[0]); pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange, pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS) << error;
} }
static void PoolingN(const Tensor *input, template<typename T>
const int *stride, void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const int *paddings, Tensor *output) {
const int pooling_size, MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel does not support dilation yet";
const PoolingType type, std::vector<index_t> output_shape(4);
Tensor *output) { std::vector<int> paddings(2);
if (type != AVG) { std::vector<index_t> filter_shape = {
MACE_NOT_IMPLEMENTED; kernels_[0], kernels_[1],
} input->dim(3), input->dim(3)
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t out_height = output->dim(2);
index_t out_width = output->dim(3);
index_t channel_blk = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(out_height * out_width),
}; };
auto runtime = OpenCLRuntime::Get(); kernels::CalcNHWCPaddingAndOutputSize(
std::set<std::string> built_options; input->shape().data(), filter_shape.data(),
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); dilations_, strides_, this->padding_,
auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); output_shape.data(), paddings.data());
const uint32_t lws[3] = {1, 8, 128}; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); DataTypeToEnum<T>::value, output);
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
pooling_kernel.setArg(idx++, stride[0]);
pooling_kernel.setArg(idx++, paddings[0]);
pooling_kernel.setArg(idx++, paddings[1]);
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
template <>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
Tensor *output) {
int paddings[2];
std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
#define POOLING_HELPER \
switch(kernels_[0]) { \
case 3: \
Pooling3(input, strides_, pooling_type_, output); \
break; \
default: \
PoolingN(input, strides_, paddings, kernels_[0], \
pooling_type_, output); \
break; \
}
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
input = &padded_input;
POOLING_HELPER
} else {
POOLING_HELPER
}
#undef POOLING_HELPER
} }
template
struct PoolingFunctor<DeviceType::OPENCL, float>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
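
The local work-group size is now derived at runtime instead of being hard-coded: each dimension is clamped so that lws[0] * lws[1] * lws[2] never exceeds the kernel's reported max work-group size. A worked instance of the clamping, with illustrative numbers:

// Assume channel_blocks = 8, out_width = 120, out_height * batch = 60,
// and the device reports kwg_size = 256.
uint32_t lws0 = std::min<uint32_t>(8, 256);                   // 8
uint32_t lws1 = std::min<uint32_t>(120, 256 / lws0);          // 32
uint32_t lws2 = std::min<uint32_t>(60, 256 / (lws0 * lws1));  // 1
// Product: 8 * 32 * 1 = 256 <= kwg_size, as OpenCL requires.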
...@@ -6,24 +6,33 @@ ...@@ -6,24 +6,33 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h" #include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <> template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output) { const Tensor *input, const Tensor *resize_dims, Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape {batch, channels, out_height, out_width}; std::vector<index_t> output_shape {batch, out_height, out_width, channels};
output->Resize(out_shape); if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()( ...@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options); auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
uint32_t idx = 0; uint32_t idx = 0;
rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer()))); rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
rb_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer()))); rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
rb_kernel.setArg(idx++, height_scale); rb_kernel.setArg(idx++, height_scale);
rb_kernel.setArg(idx++, width_scale); rb_kernel.setArg(idx++, width_scale);
rb_kernel.setArg(idx++, static_cast<int>(in_height)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
rb_kernel.setArg(idx++, static_cast<int>(in_width)); rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
auto command_queue = runtime->command_queue(); auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel( cl_int error = command_queue.enqueueNDRangeKernel(
rb_kernel, cl::NullRange, rb_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch * channels), cl::NDRange(static_cast<int32_t>(channel_blocks),
static_cast<int>(out_height), static_cast<int>(out_width)), static_cast<int32_t>(out_width),
// TODO (heliangliang) tuning and fix when kwg_size < divisor static_cast<int32_t>(out_height * batch)),
cl::NDRange(1, 16, kwg_size / 16), // TODO tuning
NULL, OpenCLRuntime::Get()->GetDefaultEvent()); cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error); MACE_CHECK(error == CL_SUCCESS, error);
} }
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te ...@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
Tensor *batch_tensor) { Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype())); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options); auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -18,36 +18,66 @@ enum PoolingType { ...@@ -18,36 +18,66 @@ enum PoolingType {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> struct PoolingFunctorBase {
struct PoolingFunctor { PoolingFunctorBase(const PoolingType pooling_type,
PoolingFunctor(const PoolingType pooling_type, const int *kernels,
const int *kernels, const int *strides,
const int *strides, const Padding padding,
const Padding padding, const int *dilations)
const int *dilations)
: pooling_type_(pooling_type), : pooling_type_(pooling_type),
kernels_(kernels), kernels_(kernels),
strides_(strides), strides_(strides),
padding_(padding), padding_(padding),
dilations_(dilations) {} dilations_(dilations) {}
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
};
template<DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor) { Tensor *output_tensor) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, this->padding_,
output_shape.data(), paddings.data());
output_tensor->Resize(output_shape);
Tensor::MappingGuard in_guard(input_tensor); Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor); Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>(); const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>(); T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
const index_t *output_shape = output_tensor->shape().data();
index_t batch = output_shape[0]; index_t batch = output_shape[0];
index_t channels = output_shape[1]; index_t height = output_shape[1];
index_t height = output_shape[2]; index_t width = output_shape[2];
index_t width = output_shape[3]; index_t channels = output_shape[3];
index_t out_image_size = height * width; index_t out_image_size = height * width;
index_t input_channels = input_shape[1]; index_t input_height = input_shape[1];
index_t input_height = input_shape[2]; index_t input_width = input_shape[2];
index_t input_width = input_shape[3]; index_t input_channels = input_shape[3];
index_t in_image_size = input_height * input_width; index_t in_image_size = input_height * input_width;
int kernel_h = kernels_[0]; int kernel_h = kernels_[0];
...@@ -59,11 +89,6 @@ struct PoolingFunctor { ...@@ -59,11 +89,6 @@ struct PoolingFunctor {
int dilation_h = dilations_[0]; int dilation_h = dilations_[0];
int dilation_w = dilations_[1]; int dilation_w = dilations_[1];
int paddings[2];
std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
kernels_[0], kernels_[1]};
kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
strides_, this->padding_, paddings);
// The left-upper most offset of the padded input // The left-upper most offset of the padded input
int padded_h_start = 0 - paddings[0] / 2; int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2; int padded_w_start = 0 - paddings[1] / 2;
...@@ -71,25 +96,24 @@ struct PoolingFunctor { ...@@ -71,25 +96,24 @@ struct PoolingFunctor {
if (pooling_type_ == MAX) { if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) { T res = std::numeric_limits<T>::lowest();
T max = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh; int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
max = std::max(max, input[input_offset]); res = std::max(res, input[input_offset]);
} }
} }
} }
output[out_offset] = max; *output = res;
out_offset += 1; output++;
} }
} }
} }
...@@ -97,11 +121,10 @@ struct PoolingFunctor { ...@@ -97,11 +121,10 @@ struct PoolingFunctor {
} else if (pooling_type_ == AVG) { } else if (pooling_type_ == AVG) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (int h = 0; h < height; ++h) {
index_t out_offset = (b * channels + c) * out_image_size; for (int w = 0; w < width; ++w) {
index_t in_offset = (b * input_channels + c) * in_image_size; for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) { index_t in_offset = b * in_image_size * input_channels + c;
for (int w = 0; w < width; ++w) {
T sum = 0; T sum = 0;
int block_size = 0; int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
...@@ -110,14 +133,14 @@ struct PoolingFunctor { ...@@ -110,14 +133,14 @@ struct PoolingFunctor {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + inh * input_width + inw; index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset]; sum += input[input_offset];
block_size += 1; block_size += 1;
} }
} }
} }
output[out_offset] = sum / block_size; *output = sum / block_size;
out_offset += 1; output++;
} }
} }
} }
...@@ -125,22 +148,26 @@ struct PoolingFunctor { ...@@ -125,22 +148,26 @@ struct PoolingFunctor {
} }
} }
const PoolingType pooling_type_;
const int *kernels_;
const int *strides_;
const Padding padding_;
const int *dilations_;
}; };
template <> template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()( void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
Tensor *output_tensor); Tensor *output_tensor);
template <> template<typename T>
void PoolingFunctor<DeviceType::OPENCL, float>::operator()( struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Tensor *input_tensor, PoolingFunctor(const PoolingType pooling_type,
Tensor *output_tensor); const int *kernels,
const int *strides,
const Padding padding,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
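
Both pooling loops above were reordered from NCHW to NHWC: channels become the innermost, contiguous dimension, which is why the inner offset changed to `in_offset + (inh * input_width + inw) * input_channels` and the output is written through a running pointer. The flat NHWC index the code relies on, spelled out as a sketch (`index_t` stands in for the project's index type):

#include <cstdint>
using index_t = std::int64_t;  // assumption: mirrors mace's index type

// Flat offset of element (b, h, w, c) in an NHWC tensor of shape {B, H, W, C}.
index_t NHWCOffset(index_t b, index_t h, index_t w, index_t c,
                   index_t H, index_t W, index_t C) {
  return ((b * H + h) * W + w) * C + c;
}
// Equivalently: b * (H * W * C) + c + (h * W + w) * C, which is exactly
// in_offset + (inh * input_width + inw) * input_channels in the loops above.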
...@@ -61,63 +61,90 @@ void ResizeImage(const T *images, ...@@ -61,63 +61,90 @@ void ResizeImage(const T *images,
const index_t channels, const index_t channels,
const std::vector<CachedInterpolation> &xs_vec, const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys, const std::vector<CachedInterpolation> &ys,
float *output) { T *output) {
const index_t in_channel_size = in_height * in_width; const index_t in_batch_num_values = channels * in_height * in_width;
const index_t in_batch_num_values = channels * in_channel_size; const index_t out_batch_num_values = channels * out_height * out_width;
const index_t out_channel_size = out_height * out_width;
const index_t out_batch_num_values = channels * out_channel_size;
const CachedInterpolation *xs = xs_vec.data(); const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2) #pragma omp parallel for
for (index_t b = 0; b < batch_size; ++b) { for (index_t b = 0; b < batch_size; ++b) {
for (index_t c = 0; c < channels; ++c) { const T *batch_input_ptr = images + in_batch_num_values * b;;
const T *input_ptr = T *batch_output_ptr = output + out_batch_num_values * b;
images + in_batch_num_values * b + in_channel_size * c;
float *output_ptr = for (index_t y = 0; y < out_height; ++y) {
output + out_batch_num_values * b + out_channel_size * c; const T *y_lower_input_ptr =
for (index_t y = 0; y < out_height; ++y) { batch_input_ptr + ys[y].lower * in_width * channels;
const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width; const T *y_upper_input_ptr =
const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width; batch_input_ptr + ys[y].upper * in_width * channels;
const float ys_lerp = ys[y].lerp; T *y_output_ptr = batch_output_ptr + y * out_width * channels;
for (index_t x = 0; x < out_width; ++x) { const float ys_lerp = ys[y].lerp;
auto xs_lower = xs[x].lower;
auto xs_upper = xs[x].upper; for (index_t x = 0; x < out_width; ++x) {
auto xs_lerp = xs[x].lerp; const float xs_lerp = xs[x].lerp;
const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
const float top_left = ys_input_lower_ptr[xs_lower]; const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
const float top_right = ys_input_lower_ptr[xs_upper]; const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
const float bottom_left = ys_input_upper_ptr[xs_lower]; const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
const float bottom_right = ys_input_upper_ptr[xs_upper]; T *output_ptr = y_output_ptr + x * channels;
output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left, for (index_t c = 0; c < channels; ++c) {
bottom_right, xs_lerp, ys_lerp); const T top_left = top_left_ptr[c];
const T top_right = top_right_ptr[c];
const T bottom_left = bottom_left_ptr[c];
const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
} }
output_ptr += out_width;
} }
} }
} }
} }
} }
struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners)
: align_corners_(align_corners), size_(size) {}
protected:
void GetOutputSize(const Tensor *resize_dims,
index_t *out_height,
index_t *out_width) {
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
bool align_corners_;
std::vector<index_t> size_;
};
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ResizeBilinearFunctor { struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
public:
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: align_corners_(align_corners), size_(size) {} : ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *resize_dims, const Tensor *resize_dims,
Tensor *output) { Tensor *output) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_height = input->dim(2); const index_t in_width = input->dim(2);
const index_t in_width = input->dim(3); const index_t channels = input->dim(3);
index_t out_height; index_t out_height;
index_t out_width; index_t out_width;
GetOutputSize(resize_dims, &out_height, &out_width); GetOutputSize(resize_dims, &out_height, &out_width);
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, channels, out_height, out_width}; std::vector<index_t> out_shape{batch, out_height, out_width, channels};
output->Resize(out_shape); output->Resize(out_shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor { ...@@ -146,32 +173,18 @@ class ResizeBilinearFunctor {
ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
channels, xs, ys, output_data); channels, xs, ys, output_data);
} }
};
protected: template<typename T>
void GetOutputSize(const Tensor *resize_dims, struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
index_t *out_height, ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
index_t *out_width) { : ResizeBilinearFunctorBase(size, align_corners) {}
if (size_[0] < 0 || size_[1] < 0) {
MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
Tensor::MappingGuard resize_dims_mapper(resize_dims);
auto dims_data = resize_dims->data<int32_t>();
*out_height = dims_data[0];
*out_width = dims_data[1];
} else {
*out_height = size_[0];
*out_width = size_[1];
}
}
private: void operator()(const Tensor *input,
bool align_corners_; const Tensor *resize_dims,
std::vector<index_t> size_; Tensor *output);
}; };
template <>
void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input, const Tensor *resize_dims, Tensor *output);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
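
ResizeImage now gathers the four corner pointers per output pixel and interpolates every channel in the inner loop. For reference, the blend that ComputeLerp is expected to perform is the standard bilinear one; the actual helper is defined outside this diff, so this is a sketch:

// Blend horizontally with xs_lerp, then vertically with ys_lerp.
float ComputeLerpSketch(float top_left, float top_right,
                        float bottom_left, float bottom_right,
                        float xs_lerp, float ys_lerp) {
  const float top = top_left + (top_right - top_left) * xs_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * xs_lerp;
  return top + (bottom - top) * ys_lerp;
}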
...@@ -22,4 +22,10 @@ def if_android_arm64(a): ...@@ -22,4 +22,10 @@ def if_android_arm64(a):
return select({ return select({
"//mace:android_arm64": a, "//mace:android_arm64": a,
"//conditions:default": [], "//conditions:default": [],
}) })
\ No newline at end of file
def if_profiling(a):
return select({
"//mace:is_profiling": a,
"//conditions:default": [],
})
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
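
With OpKeyBuilder, the registry key now carries the op name plus its "T" type constraint, which is what lets the float and half OpenCL kernels above coexist under one op name. An op definition selects among them through its integer "T" argument; a sketch, assuming an OpsTestNet `net` as in the tests that follow:

// An OperatorDef carrying T = DT_HALF resolves to the half-typed
// registration above (cf. the .AddIntArg("T", ...) calls in the tests).
OpDefBuilder("AddN", "AddNExample")
    .Input("InputImage0")
    .Input("InputImage1")
    .Output("OutputImage")
    .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
    .Finalize(net.NewOperatorDef());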
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
namespace mace { namespace mace {
template<DeviceType D, class T> template <DeviceType D, class T>
class AddNOp : public Operator<D, T> { class AddNOp : public Operator<D, T> {
public: public:
AddNOp(const OperatorDef &operator_def, Workspace *ws) AddNOp(const OperatorDef &operator_def, Workspace *ws)
...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> { ...@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
bool Run() override { bool Run() override {
Tensor *output_tensor = this->outputs_[0]; Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
int n = this->inputs_.size(); int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr); vector<const Tensor *> inputs(n, nullptr);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
......
...@@ -9,47 +9,69 @@ ...@@ -9,47 +9,69 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int n, int size) { static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpDefBuilder op_def_builder("AddN", "AddNBM"); // Add input data
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str()); net.AddRandomInput<D, float>(
internal::MakeString("Input", i).c_str(), {n, h, w, c});
} }
op_def_builder.Output("Output").Finalize(net.NewOperatorDef());
// Add input data if (D == DeviceType::OPENCL) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<DeviceType::CPU, float>(internal::MakeString("Input", i).c_str(), {size}); BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
internal::MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} }
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.RunOp(D);
net.Sync();
} }
} }
#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \ #define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \ static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
const int64_t tot = static_cast<int64_t>(iters) * N * SIZE; \ int iters) { \
mace::testing::ItemsProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::ItemsProcessed(tot); \
AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
} \ AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE) } \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(N, SIZE, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \ #define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, NEON); BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(10, 1000, float);
BM_ADDN(10, 10000, float); BM_ADDN(2, 1, 240, 240, 256, float);
BM_ADDN(100, 1000, float); // BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(100, 10000, float); BM_ADDN(4, 1, 240, 240, 256, float);
} // namespace mace // BM_ADDN(4, 1, 240, 240, 256, half);
\ No newline at end of file
} // namespace mace
...@@ -9,7 +9,7 @@ namespace mace { ...@@ -9,7 +9,7 @@ namespace mace {
class AddnOpTest : public OpsTestBase {}; class AddnOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void SimpleAdd2() { void SimpleAdd2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -20,30 +20,26 @@ void SimpleAdd2() { ...@@ -20,30 +20,26 @@ void SimpleAdd2() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd2) { TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
SimpleAdd2<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd2) { /*
SimpleAdd2<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) { TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
SimpleAdd2<DeviceType::OPENCL>(); */
}
template<DeviceType D> template <DeviceType D>
void SimpleAdd3() { void SimpleAdd3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -55,62 +51,80 @@ void SimpleAdd3() { ...@@ -55,62 +51,80 @@ void SimpleAdd3() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18}); auto expected = CreateTensor<float>({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
TEST_F(AddnOpTest, CPUSimpleAdd3) { TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
SimpleAdd3<DeviceType::CPU>();
}
TEST_F(AddnOpTest, NEONSimpleAdd3) { /*
SimpleAdd3<DeviceType::NEON>(); TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
} */
template<DeviceType D> template <DeviceType D>
void RandomTest() { void RandomTest() {
// Construct graph testing::internal::LogToStderr();
OpsTestNet net; srand(time(NULL));
OpDefBuilder("AddN", "AddNTest")
.Input("Input1") for (int round = 0; round < 10; ++round) {
.Input("Input2") // generate random input
.Output("Output") index_t n = 1 + (rand() % 5);
.Finalize(net.NewOperatorDef()); index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
// Add input data index_t c = 1 + (rand() % 32);
net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4}); int input_num = 2 + rand() % 3;
net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4}); // Construct graph
OpsTestNet net;
// Check auto op_def = OpDefBuilder("AddN", "AddNTest");
net.RunOp(D); for (int i = 0; i < input_num; ++i) {
op_def.Input("Input" + ToString(i));
Tensor result; }
result.Copy(*net.GetOutput("Output")); op_def.Output("Output").Finalize(net.NewOperatorDef());
// Run // Add input data
net.RunOp(); for (int i = 0; i < input_num; ++i) {
net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5); }
}
// run on cpu
TEST_F(AddnOpTest, CPURandom) { net.RunOp();
RandomTest<DeviceType::CPU>(); // Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(net, "Input" + ToString(i),
"InputImage" + ToString(i),
kernels::BufferType::IN_OUT);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input("InputImage" + ToString(i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
}
} }
TEST_F(AddnOpTest, NEONRandom) { TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
RandomTest<DeviceType::NEON>();
}
TEST_F(AddnOpTest, OPENCLRandom) {
RandomTest<DeviceType::OPENCL>();
}
} // namespace mace } // namespace mace
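
RandomTest establishes the pattern used throughout these OpenCL tests: compute a CPU reference first, replay the graph through image buffers in half precision, then compare with a loose tolerance (0.1), since fp16 carries only about three decimal digits. The same flow condensed, as a sketch:

net.RunOp();                              // 1) CPU reference
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// 2) wrap inputs as OpenCL images (BufferToImage) and rebuild the op
//    with .AddIntArg("T", DT_HALF), then:
net.RunOp(DeviceType::OPENCL);            // 3) device run
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
                                         kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);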
...@@ -6,12 +6,26 @@ ...@@ -6,12 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, float>);
} // namespace mace REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm")
\ No newline at end of file .TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
} // namespace mace
...@@ -13,28 +13,45 @@ static void BatchNorm( ...@@ -13,28 +13,45 @@ static void BatchNorm(
int iters, int batch, int channels, int height, int width) { int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming(); mace::testing::StopTiming();
if ( D == OPENCL )
OpenCLRuntime::EnableProfiling();
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Scale", {channels}); net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels}); net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels}); net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true); net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(D); net.RunOp(D);
......
...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {}; ...@@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {};
template <DeviceType D> template <DeviceType D>
void Simple() { void Simple() {
// Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 6, 2}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}); net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}); net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
...@@ -32,12 +22,44 @@ void Simple() { ...@@ -32,12 +22,44 @@ void Simple() {
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}); net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, CreateTensor<float>({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); 3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) { ...@@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
Simple<DeviceType::CPU>(); Simple<DeviceType::CPU>();
} }
/*
TEST_F(BatchNormOpTest, SimpleNEON) { TEST_F(BatchNormOpTest, SimpleNEON) {
Simple<DeviceType::NEON>(); Simple<DeviceType::NEON>();
} }
*/
TEST_F(BatchNormOpTest, SimpleOPENCL) { TEST_F(BatchNormOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>(); Simple<DeviceType::OPENCL>();
} }
/*
TEST_F(BatchNormOpTest, SimpleRandomNeon) { TEST_F(BatchNormOpTest, SimpleRandomNeon) {
srand(time(NULL)); srand(time(NULL));
...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) { ...@@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
*/
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL)); srand(time(NULL));
...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 64; index_t height = 64;
index_t width = 64; index_t width = 64;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// tuning // run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
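The setenv/unsetenv pair around the first OpenCL run is what triggers the kernel-parameter search; the second run then replays the tuned parameters. A hypothetical RAII wrapper for the same pattern (not part of this codebase; setenv/unsetenv are POSIX, declared in <cstdlib>):

class ScopedTuning {
 public:
  ScopedTuning() { setenv("MACE_TUNING", "1", 1); }  // enable tuning search
  ~ScopedTuning() { unsetenv("MACE_TUNING"); }       // back to tuned replay
};

// Usage: { ScopedTuning t; net.RunOp(DeviceType::OPENCL); }  // tuning run
//        net.RunOp(DeviceType::OPENCL);                      // tuned run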
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
index_t channels = 3 + rand() % 50; index_t channels = 3 + rand() % 50;
index_t height = 103; index_t height = 103;
index_t width = 113; index_t width = 113;
// Construct graph // Construct graph
auto &net = test_net(); auto &net = test_net();
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -204,13 +251,38 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width}); net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true); net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3}); net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::OPENCL);
net.Sync(); net.Sync();
// Check ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
Tensor expected; ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
} }
} }
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchToSpaceND")
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<float>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { ...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
} }
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgMedia) { TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
} }
...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) { ...@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) {
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
} }
template<DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
// Run
net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3);
}
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
}
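The 1e-3 tolerance on this float-to-half roundtrip comes straight from half precision: 10 stored significand bits put neighboring half values in [1, 2) a distance 2^-10 apart, so one round-to-nearest step loses at most half of that. A quick numeric check:

constexpr float kHalfUlpNearOne = 1.0f / 1024.0f;       // 2^-10 ~= 9.77e-4
constexpr float kMaxRoundError  = kHalfUlpNearOne / 2;  // ~4.9e-4 < 1e-3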
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("ChannelShuffle")
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Concat, ConcatOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::CPU, float>);
} // namespace mace } // namespace mace
...@@ -6,12 +6,31 @@ ...@@ -6,12 +6,31 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
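Because the registry key now encodes the "T" constraint, the float and half instantiations of one op coexist, and the "T" integer argument added to the OperatorDefs in the tests below is what routes a node to one of them. A minimal dispatch sketch under that assumption — GetIntArg, the registry map, and the runtime-typed TypeConstraint overload are illustrative names, not the actual API:

// Hypothetical lookup: op name plus the "T" attribute rebuild the key.
DataType t = static_cast<DataType>(GetIntArg(op_def, "T", DT_FLOAT));
std::string key = OpKeyBuilder(op_def.type().c_str())
                      .TypeConstraint("T", t)
                      .Build();
std::unique_ptr<OperatorBase> op(registry.at(key)(op_def, ws));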
...@@ -33,9 +33,9 @@ static void Conv2d(int iters, ...@@ -33,9 +33,9 @@ static void Conv2d(int iters,
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -44,6 +44,7 @@ static void Conv2d(int iters, ...@@ -44,6 +44,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -54,6 +55,7 @@ static void Conv2d(int iters, ...@@ -54,6 +55,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
...@@ -91,39 +93,39 @@ static void Conv2d(int iters, ...@@ -91,39 +93,39 @@ static void Conv2d(int iters,
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet // ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float); BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float); BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
// Test RGB <-> YUV // Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments //BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); //BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); //BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); //BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
} // namespace mace } // namespace mace
...@@ -84,23 +84,23 @@ TEST_F(Conv2dOpTest, NEONSimple) { ...@@ -84,23 +84,23 @@ TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3SAME<DeviceType::NEON>(); TestSimple3x3SAME<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() { void TestNHWCSimple3x3VALID() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1}, "Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() { ...@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -125,33 +126,34 @@ void TestNHWCSimple3x3VALID() { ...@@ -125,33 +126,34 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f}); auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
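The expected 18.1f is easy to verify by hand:

// Hand check: a VALID 3x3 conv over a 3x3x2 all-ones input with an all-ones
// filter sums 3 * 3 * 2 = 18 products of 1.0f; adding the 0.1f bias gives
// 18.1f. The relaxed 0.01 tolerance also covers half-typed runs.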
template<DeviceType D> template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() { void TestNHWCSimple3x3SAME() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1}, "Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() { ...@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -176,6 +179,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -176,6 +179,7 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -185,17 +189,17 @@ void TestNHWCSimple3x3SAME() { ...@@ -185,17 +189,17 @@ void TestNHWCSimple3x3SAME() {
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUSimple) { TEST_F(Conv2dOpTest, CPUSimple) {
TestNHWCSimple3x3VALID<DeviceType::CPU>(); TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestNHWCSimple3x3SAME<DeviceType::CPU>(); TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
} }
TEST_F(Conv2dOpTest, OPENCLSimple) { TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL>(); TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL>(); TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
...@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) { ...@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>(); TestSimple3x3WithoutBias<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() { void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1}, "Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
...@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f}); auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUWithoutBias) { TEST_F(Conv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU>(); TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
} }
TEST_F(Conv2dOpTest, OPENCLWithoutBias) { TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>(); TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
...@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) { ...@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>(); TestCombined3x3<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
static void TestNHWCCombined3x3() { static void TestNHWCCombined3x3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 2}, "Filter", {3, 3, 2, 2},
{1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f}); 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage") .Input("InputImage")
...@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() { ...@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input") .Input("Input")
...@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() { ...@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -388,27 +396,22 @@ static void TestNHWCCombined3x3() { ...@@ -388,27 +396,22 @@ static void TestNHWCCombined3x3() {
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f,
12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f,
8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUCombined) { TEST_F(Conv2dOpTest, CPUStride2) {
TestNHWCCombined3x3<DeviceType::CPU>(); TestNHWCCombined3x3<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
void TestConv1x1() { void TestConv1x1() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
...@@ -425,8 +428,37 @@ void TestConv1x1() { ...@@ -425,8 +428,37 @@ void TestConv1x1() {
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Run if (D == DeviceType::OPENCL) {
net.RunOp(D); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check // Check
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
...@@ -445,11 +477,11 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { ...@@ -445,11 +477,11 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>(); TestConv1x1<DeviceType::CPU>();
} }
//TEST_F(Conv2dOpTest, OPENCLConv1x1) { TEST_F(Conv2dOpTest, OPENCLConv1x1) {
// TestConv1x1<DeviceType::OPENCL>(); TestConv1x1<DeviceType::OPENCL>();
//} }
template<DeviceType D> template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
...@@ -457,11 +489,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -457,11 +489,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
srand(time(NULL)); srand(time(NULL));
// generate random input // generate random input
index_t batch = 3 + rand() % 10; index_t batch = 3 + (rand() % 10);
index_t height = shape[0]; index_t height = shape[0];
index_t width = shape[1]; index_t width = shape[1];
index_t input_channels = shape[2] + rand() % 10; index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + rand() % 10; index_t output_channels = shape[3] + (rand() % 10);
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -472,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -472,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>( net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}); "Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
...@@ -487,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -487,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -499,25 +532,136 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -499,25 +532,136 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
for (int kernel_size : {3}) { for (int kernel_size : {1, 3}) {
for (int stride : {1}) { for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME); func(kernel_size, kernel_size, stride, stride, SAME);
} }
} }
} }
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 64, 128}); TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 32, 64});
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7}); TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
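The "aligned" shape keeps channel counts divisible by 4 while the unaligned one does not, which matters if the OpenCL image layout packs 4 channels per RGBA texel — an assumption about the packing here; the rounding math below is illustrative:

// Assumed NHWC -> image2d packing, 4 channels per texel; unaligned channel
// counts leave a partially filled texel at the end of every pixel.
inline size_t ImageWidthInTexels(size_t width, size_t channels) {
  return width * ((channels + 3) / 4);
}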
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape) {
testing::internal::LogToStderr();
srand(time(NULL));
auto func = [&](int stride_h, int stride_w, Padding padding) {
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t input_channels = filter_shape[2] + (rand() % 10);
index_t output_channels = filter_shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, float_bias_data);
// Add input data
net.AddInputFromArray<D, float>("Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
};
for (int stride : {1, 2}) {
func(stride, stride, VALID);
func(stride, stride, SAME);
}
}
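The loose 0.5 tolerance reflects half-precision accumulation: the absolute error of a dot product grows roughly linearly with the reduction length K = kernel_h * kernel_w * input_channels. An order-of-magnitude estimate, not a rigorous bound:

constexpr float kHalfEps = 1.0f / 1024.0f;  // 2^-10 significand spacing
inline float RoughDotError(int reduction_len) {
  return reduction_len * kHalfEps;          // e.g. K = 256 -> ~0.25
}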
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 1, 32, 64});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{3, 3, 32, 64});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{15, 1, 256, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 15, 256, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x7S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{7, 7, 3, 64});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{1, 1, 5, 7});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{3, 3, 5, 7});
} }
...@@ -6,15 +6,21 @@ ...@@ -6,15 +6,21 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(DepthwiseConv2d, REGISTER_CPU_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>); DepthwiseConv2dOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(DepthwiseConv2d, REGISTER_NEON_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::NEON, float>); DepthwiseConv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(DepthwiseConv2d, REGISTER_OPENCL_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>); DepthwiseConv2dOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/fused_conv_2d.h"
namespace mace {
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, half>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_FUSED_CONV_2D_H_
#define MACE_OPS_FUSED_CONV_2D_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/fused_conv_2d.h"
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
template <DeviceType D, typename T>
class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
public:
FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_,
this->dilations_.data()) {
}
bool Run() override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
return true;
}
private:
kernels::FusedConv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_FUSED_CONV_2D_H_
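The header only wires up the functor; the fused kernel itself lives in mace/kernels/fused_conv_2d.h. The tests below feed all-negative activations and expect all-zero outputs, which is consistent with a ReLU folded into the convolution epilogue — a scalar sketch of that inference, not the actual kernel:

#include <algorithm>

inline float FusedConvEpilogue(float acc, float bias) {
  return std::max(acc + bias, 0.0f);  // bias add + ReLU in one pass (assumed)
}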
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/fused_conv_2d.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
class FusedConv2dOpTest : public OpsTestBase {};
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>(
{1, 3, 3, 1},
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(FusedConv2dOpTest, CPUSimple) {
TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
}
TEST_F(FusedConv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(FusedConv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
}
TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
}
template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {1, 1, 5, 2},
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
{1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
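The expected rows are quick to verify, since a 1x1 convolution reduces only over channels:

// Hand check: each output pixel dots 5 ones against the per-output-channel
// filter column, then adds the bias:
//   channel 0: 5 * 1.0f + 0.1f = 5.1f
//   channel 1: 5 * 2.0f + 0.2f = 10.2f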
TEST_F(FusedConv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>();
}
TEST_F(FusedConv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, float_bias_data);
// Add input data
net.AddInputFromArray<D, float>("Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
}
}
}
TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
}
...@@ -6,11 +6,15 @@ ...@@ -6,11 +6,15 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(GlobalAvgPooling, REGISTER_CPU_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::CPU, float>); GlobalAvgPoolingOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(GlobalAvgPooling, REGISTER_NEON_OPERATOR(OpKeyBuilder("GlobalAvgPooling")
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::NEON, float>); GlobalAvgPoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
......
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<float>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer")
.TypeConstraint<half>("T")
.Build(),
ImageToBufferOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape, ...@@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> &res) { std::vector<T> &res) {
std::random_device rd; std::random_device rd;
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1); std::normal_distribution<float> nd(0, 1);
index_t size = std::accumulate(shape.begin(), shape.end(), 1, index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
res.resize(size); res.resize(size);
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
} else {
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
}
} }
template <typename T> template <typename T>
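A note on the GenerateRandomRealTypeData change above: std::normal_distribution is only defined for float, double, and long double, so half samples must be drawn as float and cast down. A self-contained sketch of the same pattern (the half.hpp include path is an assumption about where the vendored fp16 library lives):

#include <algorithm>
#include <random>
#include <vector>
#include "half.hpp"  // assumed: the vendored half_float library

std::vector<half_float::half> RandomHalfData(size_t n) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> nd(0, 1);  // <half> would be ill-formed
  std::vector<half_float::half> res(n);
  std::generate(res.begin(), res.end(), [&gen, &nd] {
    return half_float::half_cast<half_float::half>(nd(gen));
  });
  return res;
}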
...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) { ...@@ -289,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b); EXPECT_DOUBLE_EQ(a, b);
} }
inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { inline void AssertSameDims(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), y.dtype());
ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
<< "y.shape [ " << ShapeToString(y) << "]"; << "y.shape [ " << ShapeToString(y) << "]";
} }
template <typename T, bool is_fp = is_floating_point_type<T>::value> template <typename EXP_TYPE, typename RES_TYPE, bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector; struct Expector;
// Partial specialization for float and double. // Partial specialization for float and double.
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<T, true> { struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) { static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a(i), b(i)); ExpectEqual(a(i), b(i));
} }
} }
static void Near(const Tensor &x, const Tensor &y, const double abs_err) { static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i; << " index = " << i;
...@@ -334,17 +340,18 @@ template <typename T> ...@@ -334,17 +340,18 @@ template <typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<T>::value, static_assert(is_floating_point_type<T>::value,
"T is not a floating point type"); "T is not a floating point type");
Expector<T>::Near(x, y, abs_err); Expector<T, T>::Near(x, y, abs_err);
} }
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
std::string ToString(const T &input) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
std::stringstream ss; static_assert(is_floating_point_type<EXP_TYPE>::value
ss << input; && is_floating_point_type<RES_TYPE>::value,
return ss.str(); "T is not a floating point type");
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
} }
template <DeviceType D> template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net, void BufferToImage(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
net.Sync(); net.Sync();
} }
template <DeviceType D> template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet &net, void ImageToBuffer(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, ...@@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
......
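Summarizing the ops_test_util.h changes: Expector and ExpectTensorNear now take separate expected/result element types, so a float CPU reference can be checked directly against a half GPU output, and BufferToImage/ImageToBuffer gained a type parameter that is forwarded as the "T" argument of the conversion op. A stripped-down sketch of the two-type comparison:

#include <cassert>
#include <cmath>
#include <vector>

// EXP_TYPE is the reference (e.g. float), RES_TYPE the device output
// (e.g. half); both are widened to double before comparing.
template <typename EXP_TYPE, typename RES_TYPE>
void ExpectAllNear(const std::vector<EXP_TYPE> &expected,
                   const std::vector<RES_TYPE> &result, double abs_err) {
  assert(expected.size() == result.size());
  for (size_t i = 0; i < expected.size(); ++i) {
    assert(std::fabs(static_cast<double>(expected[i]) -
                     static_cast<double>(result[i])) <= abs_err);
  }
}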
...@@ -6,11 +6,29 @@ ...@@ -6,11 +6,29 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Pooling, PoolingOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Pooling, PoolingOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Pooling, PoolingOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling")
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> { ...@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape(4);
// TODO(chenghui): is it kind of a hack?
filter_shape[0] = input->shape()[1];
filter_shape[1] = input->shape()[0];
filter_shape[2] = kernels_[0];
filter_shape[3] = kernels_[1];
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_(input, output); functor_(input, output);
return true; return true;
}; };
......
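The explicit shape computation was dropped from PoolingOp::Run, presumably because the functor now resizes the output itself. For reference, a sketch of the standard VALID/SAME output-size arithmetic (TensorFlow-style conventions, which the expected shapes in the tests below match):

#include <cstdint>

int64_t PooledDim(int64_t in, int kernel, int stride, bool same_padding) {
  if (same_padding) return (in + stride - 1) / stride;  // ceil(in / stride)
  return (in - kernel) / stride + 1;                    // VALID, assumes in >= kernel
}
// e.g. MAX_VALID below: PooledDim(4, 2, 2, false) == 2 per spatial dim,
// and MAX_k2x2s2x2 (SAME): PooledDim(9, 2, 2, true) == 5.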
...@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) { ...@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4}, "Input", {1, 4, 4, 2},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
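The literal inputs in these tests changed because the tensors are now laid out NHWC instead of NCHW. With channel 0 holding 0..15 and channel 1 holding 16..31, writing the same 1x4x4x2 tensor in NHWC order interleaves the two channels, which reproduces the new literal list exactly:

#include <cstdio>

int main() {
  const int C = 2, H = 4, W = 4;
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int c = 0; c < C; ++c)
        std::printf("%d, ", c * H * W + h * W + w);  // value of channel c at (h, w)
  // prints: 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, ...
}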
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_SAME) { TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph // Construct graph
...@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { ...@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 1, 3, 3}, net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8}); {0, 1, 2, 3, 4, 5, 6, 7, 8});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {4, 5, 7, 8}); auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { ...@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 4, 4}, "Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 2}, {10, 11, 14, 15}); auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ...@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Add input data // Add input data
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 2, 9}, "Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
// Run // Run
net.RunOp(DeviceType::NEON); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 5}, {10, 12, 14, 16, 17}); auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
template<DeviceType D>
template <DeviceType D>
static void SimpleMaxPooling3S2() { static void SimpleMaxPooling3S2() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {1, 1, 3, 9}, "Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
// Run
net.RunOp(D); if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
}
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26}); auto expected = CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() { ...@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() {
TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<CPU>(); SimpleMaxPooling3S2<CPU>();
} }
TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<OPENCL>(); SimpleMaxPooling3S2<OPENCL>();
} }
template <DeviceType D> template<DeviceType D, typename T>
static void AlignedMaxPooling3S2(Padding padding) { static void MaxPooling3S2(const std::vector<index_t> &input_shape,
const std::vector<int> strides,
Padding padding) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
...@@ -198,22 +185,35 @@ static void AlignedMaxPooling3S2(Padding padding) { ...@@ -198,22 +185,35 @@ static void AlignedMaxPooling3S2(Padding padding) {
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3}) .AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 64, 64}); net.AddRandomInput<D, T>("Input", input_shape);
// Run
net.RunOp(D); // run on cpu
net.RunOp();
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on cpu BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
net.RunOp(); OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001); ExpectTensorNear<T>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
} }
// TODO(chenghui) : there is a bug. // TODO(chenghui) : there is a bug.
...@@ -223,152 +223,158 @@ static void AlignedMaxPooling3S2(Padding padding) { ...@@ -223,152 +223,158 @@ static void AlignedMaxPooling3S2(Padding padding) {
//} //}
TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
AlignedMaxPooling3S2<OPENCL>(Padding::VALID); MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
AlignedMaxPooling3S2<OPENCL>(Padding::SAME); MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) {
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
} }
template <DeviceType D> TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
static void UnalignedMaxPooling3S2(Padding padding) { MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
}
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph // Construct graph
OpsTestNet net; auto &net = test_net();
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", padding) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 113, 43, 47}); net.AddInputFromArray<DeviceType::CPU, float>(
// Run "Input", {1, 4, 4, 2},
net.RunOp(D); {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
Tensor expected; 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
expected.Copy(*net.GetOutput("Output"));
// Run on cpu // Run
net.RunOp(); net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001); // Check
} auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
// TODO(chenghui) : there is a bug.
//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) {
// UnalignedMaxPooling3S2<NEON>();
//}
TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
UnalignedMaxPooling3S2<OPENCL>(Padding::VALID);
UnalignedMaxPooling3S2<OPENCL>(Padding::SAME);
} }
template <DeviceType D> template<DeviceType D>
static void SimpleAvgPoolingTest() { static void SimpleAvgPoolingTest() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("InputImage")
.Output("Output") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2}) .AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5}); auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
TEST_F(PoolingOpTest, NEONSimpleAvgPooling) {
SimpleAvgPoolingTest<NEON>();
}
TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<OPENCL>(); SimpleAvgPoolingTest<OPENCL>();
} }
template <DeviceType D> template<DeviceType D, typename T>
static void AlignedAvgPoolingTest(Padding padding) { static void AvgPoolingTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels,
const std::vector<int> &strides,
Padding padding) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {4, 4}) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", {4, 4}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 15, 15}); net.AddRandomInput<D, float>("Input", shape);
// Run
net.RunOp(D);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on cpu // run on cpu
net.RunOp(); net.RunOp();
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
}
TEST_F(PoolingOpTest, NEONAlignedAvgPooling) {
AlignedAvgPoolingTest<NEON>(Padding::VALID);
AlignedAvgPoolingTest<NEON>(Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AlignedAvgPoolingTest<OPENCL>(Padding::VALID);
AlignedAvgPoolingTest<OPENCL>(Padding::SAME);
}
template <DeviceType D>
static void UnAlignedAvgPoolingTest(Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("Input") .Input("InputImage")
.Output("Output") .Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG) .AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {7, 7}) .AddIntsArg("kernels", kernels)
.AddIntsArg("strides", {7, 7}) .AddIntsArg("strides", strides)
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {3, 128, 31, 37});
// Run
net.RunOp(D); net.RunOp(D);
Tensor expected; ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
expected.Copy(*net.GetOutput("Output"));
// Run on cpu ExpectTensorNear<float, T>(expected, *net.GetOutput("OPENCLOutput"), 0.01);
net.RunOp(); }
ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5); TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
} }
TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) {
UnAlignedAvgPoolingTest<NEON>(Padding::VALID); AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
UnAlignedAvgPoolingTest<NEON>(Padding::SAME); AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME);
}
TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID);
AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
UnAlignedAvgPoolingTest<OPENCL>(Padding::VALID); AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID);
UnAlignedAvgPoolingTest<OPENCL>(Padding::SAME); AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME);
} }
TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID);
AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME);
}
...@@ -6,10 +6,16 @@ ...@@ -6,10 +6,16 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>); REGISTER_NEON_OPERATOR(OpKeyBuilder("Relu")
.TypeConstraint<float>("T")
.Build(),
ReluOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
......
...@@ -6,14 +6,26 @@ ...@@ -6,14 +6,26 @@
namespace mace { namespace mace {
REGISTER_CPU_OPERATOR(ResizeBilinear, ResizeBilinearOp<DeviceType::CPU, float>); REGISTER_CPU_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(ResizeBilinear, REGISTER_NEON_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::NEON, float>); ResizeBilinearOp<DeviceType::NEON, float>);
#endif // __ARM_NEON #endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(ResizeBilinear, REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<float>("T")
.Build(),
ResizeBilinearOp<DeviceType::OPENCL, float>); ResizeBilinearOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear")
.TypeConstraint<half>("T")
.Build(),
ResizeBilinearOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -19,18 +19,30 @@ static void ResizeBilinearBenchmark(int iters, ...@@ -19,18 +19,30 @@ static void ResizeBilinearBenchmark(int iters,
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntsArg("size", {output_height, output_width})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", net.AddRandomInput<D, float>("Input",
{batch, channels, input_height, input_width}); {batch, input_height, input_width, channels});
net.AddInputFromArray<D, index_t>("OutSize", {2}, net.AddInputFromArray<D, index_t>("OutSize", {2},
{output_height, output_width}); {output_height, output_width});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("InputImage")
.Input("OutSize")
.Output("OutputImage")
.AddIntsArg("size", {output_height, output_width})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntsArg("size", {output_height, output_width})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -58,9 +70,12 @@ static void ResizeBilinearBenchmark(int iters, ...@@ -58,9 +70,12 @@ static void ResizeBilinearBenchmark(int iters,
#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE) \ #define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE) \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU); \ BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, NEON); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL); BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
// SNPE 835 GPU: 6870us
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float); BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float); BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float);
BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float); BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float);
......
...@@ -23,14 +23,14 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { ...@@ -23,14 +23,14 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
// Add input data // Add input data
vector<float> input(24); vector<float> input(24);
std::iota(begin(input), end(input), 0); std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input); net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2}); net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18}); auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
...@@ -49,14 +49,14 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { ...@@ -49,14 +49,14 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
// Add input data // Add input data
vector<float> input(24); vector<float> input(24);
std::iota(begin(input), end(input), 0); std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input); net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2}); net.AddInputFromArray<DeviceType::CPU, int>("OutSize", {2}, {1, 2});
// Run // Run
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19}); auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
} }
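Both expected outputs follow directly from the resize coordinate mapping (TensorFlow-style, which these tests match): without align_corners the source index is dst * in / out; with align_corners it is dst * (in - 1) / (out - 1). A sketch:

#include <cstdio>

float SrcIndex(int dst, int in, int out, bool align_corners) {
  if (align_corners && out > 1)
    return dst * static_cast<float>(in - 1) / (out - 1);
  return dst * static_cast<float>(in) / out;
}

int main() {
  // Width 4 -> 2: dst 1 maps to src 2.0 without align_corners (values 6,7,8)
  // and to src 3.0 with align_corners (values 9,10,11), as the tests expect.
  std::printf("%.1f %.1f\n", SrcIndex(1, 4, 2, false), SrcIndex(1, 4, 2, true));
}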
...@@ -65,6 +65,7 @@ template <DeviceType D> ...@@ -65,6 +65,7 @@ template <DeviceType D>
void TestRandomResizeBilinear() { void TestRandomResizeBilinear() {
srand(time(nullptr)); srand(time(nullptr));
testing::internal::LogToStderr(); testing::internal::LogToStderr();
for (int round = 0; round < 10; ++round) { for (int round = 0; round < 10; ++round) {
int batch = 1 + rand() % 5; int batch = 1 + rand() % 5;
int channels = 1 + rand() % 100; int channels = 1 + rand() % 100;
...@@ -72,39 +73,54 @@ void TestRandomResizeBilinear() { ...@@ -72,39 +73,54 @@ void TestRandomResizeBilinear() {
int width = 1 + rand() % 100; int width = 1 + rand() % 100;
int in_height = 1 + rand() % 100; int in_height = 1 + rand() % 100;
int in_width = 1 + rand() % 100; int in_width = 1 + rand() % 100;
int align_corners = rand() % 2;
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntArg("align_corners", 1)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", net.AddRandomInput<D, float>("Input",
{batch, channels, in_height, in_width}); {batch, in_height, in_width, channels});
net.AddInputFromArray<D, int>("OutSize", {2}, {height, width}); net.AddInputFromArray<D, int>("OutSize", {2}, {height, width});
// Run OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
net.RunOp(D); .Input("Input")
Tensor actual; .Input("OutSize")
actual.Copy(*net.GetOutput("Output")); .Output("Output")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run on CPU // Run on CPU
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
Tensor *expected = net.GetOutput("Output"); Tensor expected;
expected.Copy(*net.GetOutput("Output"));
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputImage")
.Input("OutSize")
.Output("OutputImage")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput", kernels::BufferType::IN_OUT);
} else {
// TODO support NEON
}
// Check // Check
ExpectTensorNear<float>(*expected, actual, 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.001);
} }
} }
/*
TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) { TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::NEON>(); TestRandomResizeBilinear<DeviceType::NEON>();
} }
*/
TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) { TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::OPENCL>(); TestRandomResizeBilinear<DeviceType::OPENCL>();
......
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
namespace mace { namespace mace {
REGISTER_OPENCL_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp<DeviceType::OPENCL, float>); REGISTER_OPENCL_OPERATOR(OpKeyBuilder("SpaceToBatchND")
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::OPENCL, float>);
} // namespace mace } // namespace mace
...@@ -67,12 +67,20 @@ message NodeInput { ...@@ -67,12 +67,20 @@ message NodeInput {
optional int32 output_port = 2; optional int32 output_port = 2;
} }
message OutputShape {
repeated int64 dims = 1;
}
message OperatorDef { message OperatorDef {
repeated string input = 1; repeated string input = 1;
repeated string output = 2; repeated string output = 2;
optional string name = 3; optional string name = 3;
optional string type = 4; optional string type = 4;
repeated Argument arg = 5; repeated Argument arg = 5;
repeated OutputShape output_shape = 6;
// Memory optimization: only support one single output op
optional int32 mem_id = 10 [default = -1];
// for hexagon mace-nnlib // for hexagon mace-nnlib
optional uint32 node_id = 100; optional uint32 node_id = 100;
...@@ -82,6 +90,16 @@ message OperatorDef { ...@@ -82,6 +90,16 @@ message OperatorDef {
repeated int32 out_max_byte_size = 104; // only support 32-bit len repeated int32 out_max_byte_size = 104; // only support 32-bit len
} }
// for memory optimization
message MemoryBlock {
optional int32 mem_id = 1;
optional uint32 x = 2;
optional uint32 y = 3;
}
message MemoryArena {
repeated MemoryBlock mem_block = 1;
}
// for hexagon mace-nnlib // for hexagon mace-nnlib
message InputInfo { message InputInfo {
optional string name = 1; optional string name = 1;
......
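These proto additions support ahead-of-time memory planning: the converter records every op's output shapes, ops whose outputs can share storage get the same mem_id, and a MemoryArena of MemoryBlocks describes the 2D image blocks to preallocate. A hedged sketch of populating the fields through the generated C++ API (the header path and the mace package name are assumptions):

#include <cstdint>
#include <vector>
#include "mace/proto/mace.pb.h"  // assumed location of the generated header

// Record an op's output shape and the preallocated block it should reuse.
void TagForMemoryReuse(mace::OperatorDef *op, int mem_id,
                       const std::vector<int64_t> &dims) {
  op->set_mem_id(mem_id);  // -1 (the default) means no sharing
  mace::OutputShape *shape = op->add_output_shape();
  for (int64_t d : dims) shape->add_dims(d);
}

// Describe one 2D block in the arena.
void AddBlock(mace::MemoryArena *arena, int mem_id, uint32_t x, uint32_t y) {
  mace::MemoryBlock *block = arena->add_mem_block();
  block->set_mem_id(mem_id);
  block->set_x(x);
  block->set_y(y);
}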
...@@ -21,7 +21,7 @@ def main(unused_args): ...@@ -21,7 +21,7 @@ def main(unused_args):
if FLAGS.runtime == 'dsp': if FLAGS.runtime == 'dsp':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node) input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
else: else:
output_graph_def = tf_converter_lib.convert_to_mace_pb( output_graph_def = tf_converter_lib.convert_to_mace_pb(
input_graph_def) input_graph_def)
...@@ -62,6 +62,11 @@ def parse_args(): ...@@ -62,6 +62,11 @@ def parse_args():
type=str, type=str,
default="softmax", default="softmax",
help="e.g., softmax") help="e.g., softmax")
parser.add_argument(
"--prequantize",
type=bool,
default=False,
help="e.g., False")
return parser.parse_known_args() return parser.parse_known_args()
......
...@@ -18,15 +18,6 @@ def convert_tensor(op, tensor): ...@@ -18,15 +18,6 @@ def convert_tensor(op, tensor):
tensor.name = op.outputs[0].name tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape) shape = list(tf_tensor.shape)
if (op.name.find('pointwise_kernel') != -1 or
op.name.find('depthwise_kernel') != -1 or
op.name.endswith('weights') or
op.name.endswith('kernel')) \
and op.outputs[0].consumers()[0].type.find('Conv') != -1:
if op.outputs[0].consumers()[0].get_attr('data_format') == 'NHWC':
tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
shape = [shape[3], shape[2], shape[0], shape[1]]
# print (tensor.name, shape)
tensor.dims.extend(shape) tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype') tf_dt = op.get_attr('dtype')
...@@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = first_op.type op_def.type = first_op.type
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
padding_arg = op_def.arg.add() padding_arg = op_def.arg.add()
padding_arg.name = 'padding' padding_arg.name = 'padding'
padding_arg.i = padding_mode[first_op.get_attr('padding')] padding_arg.i = padding_mode[first_op.get_attr('padding')]
...@@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def): ...@@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def):
strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add() data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format' data_format_arg.name = 'data_format'
data_format_arg.s = 'NCHW' data_format_arg.s = 'NHWC'
if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd': if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd':
bias_add_op = unresolved_ops[1] bias_add_op = unresolved_ops[1]
...@@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = 'BatchNorm' op_def.type = 'BatchNorm'
op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon]) op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon])
op_def.output.extend([output.name for output in add_1_op.outputs]) op_def.output.extend([output.name for output in add_1_op.outputs])
output_shapes = []
for output in add_1_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
resolved_count = 7 resolved_count = 7
elif first_op.type == 'Relu6': elif first_op.type == 'Relu6':
...@@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = 'Relu' op_def.type = 'Relu'
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
max_limit_arg = op_def.arg.add() max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit' max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6 max_limit_arg.f = 6
...@@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def): ...@@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def):
op_def.type = 'Pooling' op_def.type = 'Pooling'
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
pooling_type_arg = op_def.arg.add() pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type' pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[first_op.type] pooling_type_arg.i = pooling_type_mode[first_op.type]
...@@ -136,21 +151,46 @@ def convert_ops(unresolved_ops, net_def): ...@@ -136,21 +151,46 @@ def convert_ops(unresolved_ops, net_def):
kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3]) kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add() data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format' data_format_arg.name = 'data_format'
data_format_arg.s = 'NCHW' data_format_arg.s = 'NHWC'
elif first_op.type == 'Add': elif first_op.type == 'Add':
op_def = net_def.op.add() op_def = net_def.op.add()
op_def.name = first_op.name op_def.name = first_op.name
op_def.type = "AddN" op_def.type = "AddN"
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND', 'BatchToSpaceND']: output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
elif first_op.type == 'ConcatV2':
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = "Concat"
op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND',
'BatchToSpaceND', 'BiasAdd', 'FusedBatchNorm']:
op_def = net_def.op.add() op_def = net_def.op.add()
op_def.name = first_op.name op_def.name = first_op.name
op_def.type = first_op.type op_def.type = first_op.type
op_def.input.extend([input.name for input in first_op.inputs]) op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([output.name for output in first_op.outputs]) op_def.output.extend([output.name for output in first_op.outputs])
output_shapes = []
for output in first_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
else: else:
raise Exception('Unknown Op: ' + first_op.name) raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type))
pass pass
for i in range(resolved_count): for i in range(resolved_count):
......
...@@ -5,7 +5,7 @@ from dsp_ops import DspOps ...@@ -5,7 +5,7 @@ from dsp_ops import DspOps
from mace.python.tools import graph_util from mace.python.tools import graph_util
# converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \ # converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
# --runtime dsp --input_dim input_node,1,480,480,3 --output_node icnet/output_node # --runtime dsp --input_node input_node --output_node output_node
padding_mode = { padding_mode = {
'NA': 0, 'NA': 0,
...@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def): ...@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def):
for follow_op in follow_ops: for follow_op in follow_ops:
new_follow_op = mace_pb2.OperatorDef() new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op) new_follow_op.CopyFrom(follow_op)
for i in range(len(follow_op.input)): for i in xrange(len(follow_op.input)):
for k in range(3): for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k): if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k) new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
new_ops.append(new_follow_op) new_ops.append(new_follow_op)
...@@ -220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def): ...@@ -220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def):
new_net_def = mace_pb2.NetDef() new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend(tensor_map.values()) new_net_def.tensors.extend(tensor_map.values())
for op in net_def.op: new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
if op.name not in skip_ops:
new_net_def.op.extend([op])
new_net_def.op.extend(new_ops) new_net_def.op.extend(new_ops)
return new_net_def return new_net_def
...@@ -249,29 +247,101 @@ def add_node_id(net_def): ...@@ -249,29 +247,101 @@ def add_node_id(net_def):
return net_def return net_def
def add_input_output_info(net_def, input_node, output_node, graph): def add_input_output_info(net_def, input_node, output_node, graph, dtype):
input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0)) input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0)) output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
for op in net_def.op: input_info = net_def.input_info.add()
if op.name == input_node: input_info.dims.extend(input_tensor.shape.as_list())
input_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
input_info = net_def.input_info.add() input_info = net_def.input_info.add()
input_info.name = op.name input_info.dims.extend([1,1,1,1])
input_info.node_id = op.node_id input_info.data_type = mace_pb2.DT_FLOAT
input_info.dims.extend(input_tensor.shape.as_list())
input_info.max_byte_size = max_elem_size(input_tensor) output_info = net_def.output_info.add()
input_info.data_type = find_dtype(input_tensor.dtype) output_info.dims.extend(output_tensor.shape.as_list())
elif op.name == output_node: output_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
output_info = net_def.output_info.add() output_info = net_def.output_info.add()
output_info.name = op.name output_info.dims.extend([1,1,1,1])
output_info.node_id = op.node_id output_info.data_type = mace_pb2.DT_FLOAT
output_info.dims.extend(output_tensor.shape.as_list())
output_info.max_byte_size = max_elem_size(output_tensor)
output_info.data_type = find_dtype(output_tensor.dtype)
return net_def return net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node): def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
tensor_map = {}
for tensor in net_def.tensors:
tensor_map[tensor.name] = tensor
op_map = {}
for op in net_def.op:
op_map[op.name] = op
consumers = {}
for op in net_def.op:
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
skip_ops = set()
new_ops = []
skip_tensors = set()
# INPUT->Flatten->Minf, Maxf->Quantize
for op in net_def.op:
if op.type == 'INPUT':
input_op = op
flatten_op = None
quantize_op = None
for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
if o.type == 'Flatten':
flatten_op = o
elif o.type == 'Quantize':
quantize_op = o
if quantize_op is not None:
minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
skip_ops = skip_ops.union([input_op.name, flatten_op.name, minf_op.name, maxf_op.name, quantize_op.name])
skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
new_input_op = mace_pb2.OperatorDef()
new_input_op.name = input_op.name
new_input_op.type = input_op.type
new_input_op.padding = input_op.padding
new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0]/4, 4, 4])
new_ops.append(new_input_op)
for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op)
for i in xrange(len(follow_op.input)):
for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
new_ops.append(new_follow_op)
skip_ops.add(follow_op.name)
elif op.type == 'OUTPUT':
output_op = op
dequantize_op = get_node_from_map(op_map, output_op.input[0])
if dequantize_op.type == 'Dequantize':
skip_ops = skip_ops.union([dequantize_op.name, output_op.name])
new_output_op = mace_pb2.OperatorDef()
new_output_op.name = output_op.name
new_output_op.type = output_op.type
new_output_op.input.extend(dequantize_op.input)
new_ops.append(new_output_op)
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
""" """
nnlib does not have batch norm, so use tensorflow optimizer to fold nnlib does not have batch norm, so use tensorflow optimizer to fold
batch norm with convolution. The fold optimization reorders ops, so batch norm with convolution. The fold optimization reorders ops, so
...@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node): ...@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node):
add_output_node(net_def, output_node) add_output_node(net_def, output_node)
# optimized_net_def = reverse_batch_to_space_and_biasadd(net_def) # optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
if prequantize:
net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__') sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
net_def_with_node_id = add_node_id(sorted_net_def) net_def_with_node_id = add_node_id(sorted_net_def)
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph) if prequantize:
dtype = mace_pb2.DT_UINT8
else:
dtype = mace_pb2.DT_FLOAT
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
return final_net_def return final_net_def
...@@ -68,7 +68,7 @@ def main(unused_args): ...@@ -68,7 +68,7 @@ def main(unused_args):
if input_name.endswith('weights:0') and input_name in tensor_shapes: if input_name.endswith('weights:0') and input_name in tensor_shapes:
ksize = tensor_shapes[input_name] ksize = tensor_shapes[input_name]
break break
print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape.as_list(), op.outputs[0].shape.as_list())) print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape, op.outputs[0].shape))
key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format) key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format)
hist_inc(stats, key) hist_inc(stats, key)
elif op.type in ['FusedResizeAndPadConv2D']: elif op.type in ['FusedResizeAndPadConv2D']:
...@@ -92,6 +92,7 @@ def main(unused_args): ...@@ -92,6 +92,7 @@ def main(unused_args):
size = tensor_values[input_name] size = tensor_values[input_name]
break break
key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners) key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners)
print(key)
hist_inc(stats, key) hist_inc(stats, key)
elif op.type in ['AvgPool', 'MaxPool']: elif op.type in ['AvgPool', 'MaxPool']:
padding = op.get_attr('padding') padding = op.get_attr('padding')
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#define MACE_UTILS_UTILS_H_ #define MACE_UTILS_UTILS_H_
#include <sys/time.h> #include <sys/time.h>
#include <sstream>
namespace mace { namespace mace {
template <typename Integer> template <typename Integer>
...@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() { ...@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() {
return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec); return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
} }
template <typename T>
inline std::string ToString(T v) {
std::ostringstream ss;
ss << v;
return ss.str();
}
} // namespace mace } // namespace mace
#endif // MACE_UTILS_UTILS_H_ #endif // MACE_UTILS_UTILS_H_
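This ToString is the replacement for the copy deleted from ops_test_util.h above (which now includes mace/utils/utils.h); any streamable value converts:

#include <iostream>
#include "mace/utils/utils.h"

int main() {
  std::cout << mace::ToString(42) << " " << mace::ToString(2.5) << std::endl;  // 42 2.5
}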
...@@ -22,7 +22,10 @@ ANDROID_ABI=arm64-v8a ...@@ -22,7 +22,10 @@ ANDROID_ABI=arm64-v8a
STRIP="" STRIP=""
STRIP="--strip always" STRIP="--strip always"
bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI # for profiling
bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI --define profiling=true
#bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
......
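The build now passes --define profiling=true, which presumably flips a compile-time profiling switch in the OpenCL runtime. A hedged sketch of what such a switch typically gates — event profiling on the command queue (the guard macro and call site here are illustrative, not the actual MACE code):

#include <CL/cl2.hpp>  // assumed header path

cl::CommandQueue MakeQueue(const cl::Context &context, const cl::Device &device) {
#ifdef __ENABLE_PROFILING
  // Profiling builds can then read per-kernel timings via
  // event.getProfilingInfo<CL_PROFILING_COMMAND_START/END>().
  return cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
#else
  return cl::CommandQueue(context, device);
#endif
}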