diff --git a/mace/BUILD b/mace/BUILD
index 1b95aae048469510fbe8c5d272602519689408e7..dbe38d6dad5658edc052ec77ec39be41ece8a7fc 100644
--- a/mace/BUILD
+++ b/mace/BUILD
@@ -23,3 +23,11 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+
+config_setting(
+    name = "is_profiling",
+    define_values = {
+        "profiling": "true",
+    },
+    visibility = ["//visibility:public"],
+)
diff --git a/mace/core/BUILD b/mace/core/BUILD
index 4b6bb68275188ef9c4b5f269ffe3982481c7162c..6f1af8a54e3dbab2f14d30c1b6116aabe1bf183e 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -7,7 +7,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android")
+load("//mace:mace.bzl", "if_android", "if_profiling")
 
 cc_library(
     name = "opencl_runtime",
@@ -19,7 +19,7 @@ cc_library(
         "runtime/opencl/cl2.hpp",
         "runtime/opencl/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
     deps = [
         ":logging",
         "@opencl_headers//:opencl20_headers",
diff --git a/mace/core/half.h b/mace/core/half.h
index dde806fb153f76982f26f1c9d6beb28eab516ab2..9df24bd43956aa56b5de833800d63cdda5281269 100644
--- a/mace/core/half.h
+++ b/mace/core/half.h
@@ -1098,7 +1098,7 @@ namespace half_float
 
 		/// Conversion constructor.
 		/// \param rhs float to convert
-		explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+		half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
 
 		/// Conversion to single-precision.
 		/// \return single precision value representing expression value
diff --git a/mace/core/opencl_allocator.cc b/mace/core/opencl_allocator.cc
index 3b393542281266a4564767e732ea703c4371e738..0c4cf8f0f87069d20650622c578308983d61560b 100644
--- a/mace/core/opencl_allocator.cc
+++ b/mace/core/opencl_allocator.cc
@@ -13,6 +13,7 @@ namespace {
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   switch (t) {
     case DT_HALF:
+      return CL_HALF_FLOAT;
     case DT_FLOAT:
       return CL_FLOAT;
     case DT_INT8:
@@ -53,10 +54,11 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
   cl_int error;
   cl::Image2D *cl_image =
       new cl::Image2D(OpenCLRuntime::Get()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ,
+                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       img_format, image_shape[0], image_shape[1],
                       0, nullptr, &error);
+  MACE_CHECK(error == CL_SUCCESS);
 
   return cl_image;
 }
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 97be7cd11c92065fa8f8016d4ce7c18a6db5440c..e2e8936b62b46e164e1508ae08e2f998f8e12b32 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -6,6 +6,24 @@
 
 namespace mace {
 
+OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
+
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name,
+                                           const DataType allowed) {
+  type_constraint_[attr_name] = allowed;
+  return *this;
+}
+
+const std::string OpKeyBuilder::Build() {
+  static const std::vector<std::string> type_order = {"T"};
+  std::string key = op_name_;
+  for (auto type : type_order) {
+    key += type + "_" + DataTypeToString(type_constraint_[type]);
+  }
+  return key;
+}
+
 std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() {
   static std::map<int32_t, OperatorRegistry *> g_device_type_registry;
   return &g_device_type_registry;
@@ -33,7 +51,14 @@ unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
                                         Workspace *ws,
                                         DeviceType type) {
   OperatorRegistry *registry = gDeviceTypeRegistry()->at(type);
-  return registry->Create(operator_def.type(), operator_def, ws);
+  const int dtype = ArgumentHelper::GetSingleArgument<OperatorDef, int>(
+      operator_def, "T", static_cast<int>(DT_FLOAT));
+  return registry->Create(OpKeyBuilder(operator_def.type().data())
+                              .TypeConstraint("T", static_cast<DataType>(dtype))
+                              .Build(),
+                          operator_def,
+                          ws);
 }
 
 OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 8625d2802d57aea8e64ca7f004b8fbe17885168f..6ee4a9c4d2c637fd7b60c070355c02e155db7a01 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -134,6 +134,29 @@ struct DeviceTypeRegisterer {
   }
 };
 
+class OpKeyBuilder {
+ public:
+  explicit OpKeyBuilder(const char *op_name);
+
+  OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed);
+
+  template <typename T>
+  OpKeyBuilder &TypeConstraint(const char *attr_name);
+
+  const std::string Build();
+
+ private:
+  std::string op_name_;
+  std::map<std::string, DataType> type_constraint_;
+};
+
+template <typename T>
+OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
+  return this->TypeConstraint(attr_name, DataTypeToEnum<T>::value);
+}
+
 #define MACE_REGISTER_DEVICE_TYPE(type, registry_function) \
   namespace {                                              \
     static DeviceTypeRegisterer MACE_ANONYMOUS_VARIABLE(DeviceType)( \
diff --git a/mace/core/registry.h b/mace/core/registry.h
index 9a61ba1247f9a6227c69ed8e665bc7603b2f6c57..c92ebb123f03c8410129aa7ade5057e4eabe5195 100644
--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -106,10 +106,10 @@ class Registerer {
   }
 
 #define MACE_REGISTER_CREATOR(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CREATOR(RegistryName, key, __VA_ARGS__)
 
 #define MACE_REGISTER_CLASS(RegistryName, key, ...) \
-  MACE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__)
+  MACE_REGISTER_TYPED_CLASS(RegistryName, key, __VA_ARGS__)
 
 }  // namespace mace
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 4f95a9e7abd446ec8839b1998e13e5c7594dfd97..488b291d6df1061f95cccd0f89a492046eb4aa08 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -79,14 +79,16 @@ OpenCLRuntime *OpenCLRuntime::Get() {
       return;
     }
 
+    cl_command_queue_properties properties = 0;
+#ifdef __ENABLE_PROFILING
+    enable_profiling_ = true;
+    profiling_ev_.reset(new cl::Event());
+    properties = CL_QUEUE_PROFILING_ENABLE;
+#endif
+
     // a context is like a "runtime link" to the device and platform;
     // i.e. communication is possible
     cl::Context context({gpu_device});
-    cl_command_queue_properties properties = 0;
-    if (enable_profiling_) {
-      profiling_ev_.reset(new cl::Event());
-      properties = CL_QUEUE_PROFILING_ENABLE;
-    }
     cl::CommandQueue command_queue(context, gpu_device, properties);
 
     instance = new OpenCLRuntime(context, gpu_device, command_queue);
@@ -104,12 +106,12 @@ cl::Event* OpenCLRuntime::GetDefaultEvent() {
 }
 
 cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "profiling_ev_ is null, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
 }
 
 cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
-  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  MACE_CHECK(profiling_ev_, "profiling_ev_ is null, should enable profiling first.");
   return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 }
 
@@ -139,6 +141,7 @@ const std::map<std::string, std::string> OpenCLRuntime::program_map_ = {
     {"addn", "addn.cl"},
     {"batch_norm", "batch_norm.cl"},
+    {"conv_2d", "conv_2d.cl"},
     {"conv_2d_1x1", "conv_2d_1x1.cl"},
     {"conv_2d_3x3", "conv_2d_3x3.cl"},
     {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
diff --git a/mace/core/types.cc b/mace/core/types.cc
index 08e5097464624fd345d1753bfc73544a4e886f5f..5ecb5410541e36b27f83fa4e46d56956aacc1f2f 100644
--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -24,6 +24,23 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }
 
+std::string DataTypeToString(const DataType dt) {
+  static std::map<DataType, std::string> dtype_string_map = {
+      {DT_FLOAT, "DT_FLOAT"},
+      {DT_HALF, "DT_HALF"},
+      {DT_DOUBLE, "DT_DOUBLE"},
+      {DT_UINT8, "DT_UINT8"},
+      {DT_INT8, "DT_INT8"},
+      {DT_INT32, "DT_INT32"},
+      {DT_UINT32, "DT_UINT32"},
+      {DT_UINT16, "DT_UINT16"},
+      {DT_INT64, "DT_INT64"},
+      {DT_BOOL, "DT_BOOL"},
+      {DT_STRING, "DT_STRING"}
+  };
+  MACE_CHECK(dt != DT_INVALID) << "Invalid data type is not supported";
+  return dtype_string_map[dt];
+}
 
 size_t GetEnumTypeSize(const DataType dt) {
   switch (dt) {
diff --git a/mace/core/types.h b/mace/core/types.h
index 1fb6c805d3251fe058cbecc7f93b9d771e8a05e9..616e40b2aeba81a1eca0ddbe28b7acf4c56b2b0a 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -18,6 +18,8 @@ bool DataTypeCanUseMemcpy(DataType dt);
 
 size_t GetEnumTypeSize(const DataType dt);
 
+std::string DataTypeToString(const DataType dt);
+
 template <class T>
 struct IsValidDataType;
 
diff --git a/mace/dsp/BUILD b/mace/dsp/BUILD
index ca0183822ff2c6313bf8b1f8faa112becb3ef4b3..cbe9f834a40dba000bf7da74320dae422bdf99e1 100644
--- a/mace/dsp/BUILD
+++ b/mace/dsp/BUILD
@@ -24,7 +24,7 @@ cc_library(
         "*.h",
         "hexagon/*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/proto:cc_proto",
         "//mace/core:core",
@@ -36,7 +36,7 @@ cc_test(
     name = "dsp_test",
     testonly = 1,
     srcs = glob(["*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -52,7 +52,7 @@ cc_test(
     name = "dsp_op_test",
     testonly = 1,
     srcs = glob(["test/*_test.cc"]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     linkopts = if_android([
         "-ldl",
         "-lm",
@@ -64,3 +64,21 @@ cc_test(
         "//mace/kernels:kernels",
     ],
 )
+
+cc_binary(
+    name = "mace_dsp_run",
+    srcs = [
+        "tool/mace_dsp_run.cc",
+    ],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
+    linkopts = if_android([
+        "-ldl",
+        "-lm",
+    ]),
+    linkstatic = 1,
+    deps = [
+        ":dsp",
+        "//mace/kernels:kernels",
+        "//mace/utils:command_line_flags",
+    ],
+)
\ No newline at end of file
diff --git a/mace/dsp/hexagon_control_wrapper.cc b/mace/dsp/hexagon_control_wrapper.cc
index 7c65e7e5212f0797e847b22fd67640bb58854f41..3f25a5d78d208d3d10abbd18ae6234ae2033d2ed 100644
--- a/mace/dsp/hexagon_control_wrapper.cc
+++ b/mace/dsp/hexagon_control_wrapper.cc
@@ -111,22 +111,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef& net_def) {
   }
 
   // input info
-  const InputInfo& input_info = net_def.input_info()[0];
-  input_shape_.insert(input_shape_.begin(),
-                      input_info.dims().begin(), input_info.dims().end());
-  while (input_shape_.size() < 4) {
-    input_shape_.insert(input_shape_.begin(), 1);
+  num_inputs_ = 0;
+  for (const InputInfo &input_info: net_def.input_info()) {
+    vector<index_t> input_shape;
+    input_shape.insert(input_shape.begin(),
+                       input_info.dims().begin(), input_info.dims().end());
+    while (input_shape.size() < 4) {
+      input_shape.insert(input_shape.begin(), 1);
+    }
+    input_shapes_.push_back(input_shape);
+    input_data_types_.push_back(input_info.data_type());
+    num_inputs_ += 1;
   }
-  input_data_type_ = input_info.data_type();
 
   // output info
-  const OutputInfo& output_info = net_def.output_info()[0];
-  output_shape_.insert(output_shape_.begin(),
-                       output_info.dims().begin(), output_info.dims().end());
-  while (output_shape_.size() < 4) {
-    output_shape_.insert(output_shape_.begin(), 1);
+  num_outputs_ = 0;
+  for (const OutputInfo &output_info: net_def.output_info()) {
+    vector<index_t> output_shape;
+    output_shape.insert(output_shape.begin(),
+                        output_info.dims().begin(), output_info.dims().end());
+    while (output_shape.size() < 4) {
+      output_shape.insert(output_shape.begin(), 1);
+    }
+    output_shapes_.push_back(output_shape);
+    output_data_types_.push_back(output_info.data_type());
+    num_outputs_ += 1;
   }
-  output_data_type_ = output_info.data_type();
 
   bool res = hexagon_nn_prepare(nn_id_) == 0;
   return res;
@@ -218,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
   hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
 }
 
+bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
+                                         Tensor *output_tensor) {
+  LOG(INFO) << "Execute graph: " << nn_id_;
+  // single input and single output
+  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
+  output_tensor->SetDtype(output_data_types_[0]);
+  output_tensor->Resize(output_shapes_[0]);
+  vector<uint32_t> output_shape(4);
+  uint32_t output_bytes;
+  int res = hexagon_nn_execute(nn_id_,
+                               input_tensor.shape()[0],
+                               input_tensor.shape()[1],
+                               input_tensor.shape()[2],
+                               input_tensor.shape()[3],
+                               reinterpret_cast<const unsigned char *>(
+                                   input_tensor.raw_data()),
+                               input_tensor.raw_size(),
+                               &output_shape[0],
+                               &output_shape[1],
+                               &output_shape[2],
+                               &output_shape[3],
+                               reinterpret_cast<unsigned char *>(
+                                   output_tensor->raw_mutable_data()),
+                               output_tensor->raw_size(),
+                               &output_bytes);
+
+  MACE_ASSERT(output_shape == output_shapes_[0],
+              "wrong output shape inferred");
+  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
+              "wrong output bytes inferred.");
+  return res == 0;
+};
+
+bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
+                                            vector<Tensor> *output_tensors) {
+  LOG(INFO) << "Execute graph new: " << nn_id_;
+  int num_inputs = input_tensors.size();
+  int num_outputs = output_tensors->size();
+  MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
+
+  hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
+  hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
+
+  for (int i = 0; i < num_inputs; ++i) {
vector input_shape = input_tensors[i].shape(); + inputs[i].batches = input_shape[0]; + inputs[i].height = input_shape[1]; + inputs[i].width = input_shape[2]; + inputs[i].depth = input_shape[3]; + inputs[i].data = const_cast( + reinterpret_cast(input_tensors[i].raw_data())); + inputs[i].dataLen = input_tensors[i].raw_size(); + inputs[i].data_valid_len = input_tensors[i].raw_size(); + inputs[i].unused = 0; + } + + for (int i = 0; i < num_outputs; ++i) { + (*output_tensors)[i].SetDtype(output_data_types_[i]); + (*output_tensors)[i].Resize(output_shapes_[i]); + outputs[i].data = reinterpret_cast( + (*output_tensors)[i].raw_mutable_data()); + outputs[i].dataLen = (*output_tensors)[i].raw_size(); + } + + int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs, + outputs, num_outputs); + + for (int i = 0; i < num_outputs; ++i) { + vector output_shape {outputs[i].batches, outputs[i].height, + outputs[i].width, outputs[i].depth}; + MACE_ASSERT(output_shape == output_shapes_[i], + "wrong output shape inferred"); + MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(), + "wrong output bytes inferred."); + } + + delete [] inputs; + delete [] outputs; + return res == 0; +}; + +bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor, + Tensor *output_tensor) { + vector input_tensors(3); + vector output_tensors(3); + input_tensors[0].SetDtype(DT_UINT8); + output_tensors[0].SetDtype(DT_UINT8); + input_tensors[0].ResizeLike(input_tensor); + input_tensors[1].Resize({1, 1, 1, 1}); + float *min_in_data = input_tensors[1].mutable_data(); + input_tensors[2].Resize({1, 1, 1, 1}); + float *max_in_data = input_tensors[2].mutable_data(); + quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data); + if (!ExecuteGraphNew(input_tensors, &output_tensors)) { + return false; + } + + output_tensor->ResizeLike(output_tensors[0]); + + const float *min_out_data = output_tensors[1].data(); + const float *max_out_data = output_tensors[2].data(); + quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor); + return true; +} + } // namespace mace \ No newline at end of file diff --git a/mace/dsp/hexagon_control_wrapper.h b/mace/dsp/hexagon_control_wrapper.h index a67e9903b7a42f6866fe1e1a63177586bfdfd326..fa9f47b1bc7fbbb08e58b70ab15f5cb8e884f847 100644 --- a/mace/dsp/hexagon_control_wrapper.h +++ b/mace/dsp/hexagon_control_wrapper.h @@ -7,6 +7,7 @@ #include "mace/dsp/hexagon/hexagon_controller.h" #include "mace/dsp/hexagon_nn_ops.h" +#include "mace/dsp/util/quantize.h" #include "mace/core/common.h" #include "mace/core/tensor.h" #include "mace/proto/mace.pb.h" @@ -23,35 +24,10 @@ class HexagonControlWrapper { bool Finalize(); bool SetupGraph(const NetDef& net_def); bool SetupGraph(const std::string &model_file); - bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) { - LOG(INFO) << "Execute graph: " << nn_id_; - output_tensor->SetDtype(output_data_type_); - output_tensor->Resize(output_shape_); - vector output_shape(4); - uint32_t output_bytes; - int res = hexagon_nn_execute(nn_id_, - input_tensor.shape()[0], - input_tensor.shape()[1], - input_tensor.shape()[2], - input_tensor.shape()[3], - reinterpret_cast( - input_tensor.raw_data()), - input_tensor.raw_size(), - &output_shape[0], - &output_shape[1], - &output_shape[2], - &output_shape[3], - reinterpret_cast( - output_tensor->raw_mutable_data()), - output_tensor->raw_size(), - &output_bytes); - - MACE_ASSERT(output_shape == output_shape_, - "wrong output shape 
inferred"); - MACE_ASSERT(output_bytes == output_tensor->raw_size(), - "wrong output bytes inferred."); - return res == 0; - }; + bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor); + bool ExecuteGraphNew(const vector& input_tensors, + vector *output_tensors); + bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor); bool TeardownGraph(); void PrintLog(); @@ -70,11 +46,14 @@ class HexagonControlWrapper { int nn_id_; Serializer serializer_; - - vector input_shape_; - vector output_shape_; - DataType input_data_type_; - DataType output_data_type_; + Quantizer quantizer_; + + vector> input_shapes_; + vector> output_shapes_; + vector input_data_types_; + vector output_data_types_; + uint32_t num_inputs_; + uint32_t num_outputs_; DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper); }; diff --git a/mace/dsp/hexagon_control_wrapper_test.cc b/mace/dsp/hexagon_control_wrapper_test.cc index 48a743c69ecdb09bb09ca95412fe8852a86a55eb..b34e028c16b80fdfe9c280a3edf353fa9e040ec6 100644 --- a/mace/dsp/hexagon_control_wrapper_test.cc +++ b/mace/dsp/hexagon_control_wrapper_test.cc @@ -8,7 +8,7 @@ using namespace mace; -TEST(HexagonControlerWrapper, GetVersion) { +TEST(HexagonControlerWrapper, InputFloat) { testing::internal::LogToStderr(); HexagonControlWrapper wrapper; VLOG(0) << "version: " << wrapper.GetVersion(); @@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) { wrapper.ResetPerfInfo(); timeval tv1, tv2; gettimeofday(&tv1, NULL); - int round = 2; + int round = 10; for (int i = 0; i < round; ++i) { VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); } @@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) { } std::cout << std::endl; + VLOG(0) << wrapper.TeardownGraph(); + wrapper.Finalize(); +} + +TEST(HexagonControlerWrapper, PreQuantize) { + testing::internal::LogToStderr(); + HexagonControlWrapper wrapper; + VLOG(0) << "version: " << wrapper.GetVersion(); + wrapper.Init(); + wrapper.SetDebugLevel(0); + wrapper.Config(); + VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb"); + wrapper.PrintGraph(); + + Tensor input_tensor; + Tensor output_tensor; + input_tensor.Resize({1, 480, 480, 3}); + float *input_data = input_tensor.mutable_data(); + for (int i = 0; i < input_tensor.size(); ++i) { + input_data[i] = i % 256; + } + + wrapper.ResetPerfInfo(); + timeval tv1, tv2; + gettimeofday(&tv1, NULL); + int round = 10; + for (int i = 0; i < round; ++i) { + VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor); + } + gettimeofday(&tv2, NULL); + VLOG(0) << "avg duration: " + << ((tv2.tv_sec - tv1.tv_sec) * 1000 + + (tv2.tv_usec - tv1.tv_usec) / 1000) / + round; + + wrapper.GetPerfInfo(); + wrapper.PrintLog(); + + const float *output_data = output_tensor.data(); + for (int i = 0; i < output_tensor.size(); ++i) { + std::cout << output_data[i] << " "; + } + std::cout << std::endl; + VLOG(0) << wrapper.TeardownGraph(); wrapper.Finalize(); } \ No newline at end of file diff --git a/mace/dsp/test/quantized_resize_bilinear_test.cc b/mace/dsp/test/quantized_resize_bilinear_test.cc index a1f26abad75d210e546817487123cf76b646532c..12a2f8d34b94aeb21c8d3507be4ab4b545c26c2e 100644 --- a/mace/dsp/test/quantized_resize_bilinear_test.cc +++ b/mace/dsp/test/quantized_resize_bilinear_test.cc @@ -5,6 +5,7 @@ #include "mace/dsp/hexagon_control_wrapper.h" #include "gtest/gtest.h" +#define RESIZE_BILINEAR_TEST_CHANNELS 128 using namespace mace; static NetDef BuildNetDef() { @@ -17,7 +18,7 @@ static NetDef BuildNetDef() { 
input_op->set_type("INPUT"); input_op->set_node_id(0); input_op->set_padding(0); - input_op->add_out_max_byte_size(1000); + input_op->add_out_max_byte_size(1200); // relu op OperatorDef *resize_bilinear_op = net.add_op(); @@ -45,7 +46,7 @@ static NetDef BuildNetDef() { input_node_input = resize_bilinear_op->add_node_input(); input_node_input->set_node_id(12); input_node_input->set_output_port(0); - resize_bilinear_op->add_out_max_byte_size(1000); + resize_bilinear_op->add_out_max_byte_size(1200); resize_bilinear_op->add_out_max_byte_size(1000); resize_bilinear_op->add_out_max_byte_size(1000); @@ -64,8 +65,8 @@ static NetDef BuildNetDef() { new_dim_tensor->add_dims(2); new_dim_tensor->set_data_type(DataType::DT_INT32); new_dim_tensor->set_node_id(10); - new_dim_tensor->add_int32_data(1); - new_dim_tensor->add_int32_data(1); + new_dim_tensor->add_int32_data(2); + new_dim_tensor->add_int32_data(2); TensorProto *input_min_tensor = net.add_tensors(); input_min_tensor->set_name("input_min"); @@ -86,20 +87,20 @@ static NetDef BuildNetDef() { input_info->set_name("input_node"); input_info->set_node_id(0); input_info->add_dims(1); - input_info->add_dims(2); - input_info->add_dims(2); - input_info->add_dims(128); + input_info->add_dims(3); + input_info->add_dims(3); + input_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS); input_info->set_data_type(DataType::DT_UINT8); - input_info->set_max_byte_size(1000); + input_info->set_max_byte_size(1200); OutputInfo *output_info = net.add_output_info(); output_info->set_name("output_node"); output_info->set_node_id(1); output_info->add_dims(1); - output_info->add_dims(1); - output_info->add_dims(1); - output_info->add_dims(128); + output_info->add_dims(2); + output_info->add_dims(2); + output_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS); output_info->set_data_type(DataType::DT_UINT8); - output_info->set_max_byte_size(1000); + output_info->set_max_byte_size(1200); return net; } @@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) { Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU); Tensor input_tensor(cpu_allocator, DT_UINT8); Tensor output_tensor(cpu_allocator, DT_UINT8); - input_tensor.Resize({1, 2, 2, 128}); - output_tensor.Resize({1, 1, 1, 128}); + input_tensor.Resize({1, 3, 3, RESIZE_BILINEAR_TEST_CHANNELS}); + output_tensor.Resize({1, 2, 2, RESIZE_BILINEAR_TEST_CHANNELS}); uint8_t *input_data = input_tensor.mutable_data(); const uint8_t *output_data = output_tensor.data(); - for (int c = 0; c < 128; ++c) { - input_data[c] = input_data[c + 128] = input_data[c + 256] - = input_data[c + 384] = (uint8_t)c; + for (int wh = 0; wh < 9; ++wh) { + for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) { + input_data[wh * RESIZE_BILINEAR_TEST_CHANNELS + c] = 9 - wh; + } } VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); wrapper.PrintLog(); - for (int i = 0; i < output_tensor.size(); ++i) { - EXPECT_EQ(i, output_data[i]); + vector expected {9, 8, 5, 3}; + for (int i = 0; i < 4; ++i) { + for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) + EXPECT_EQ(expected[i], + output_data[i * RESIZE_BILINEAR_TEST_CHANNELS + c]); } std::cout << std::endl; diff --git a/mace/dsp/tool/mace_dsp_run.cc b/mace/dsp/tool/mace_dsp_run.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c8e7afae7acfccec8418b4b63da75d0a6d47af4 --- /dev/null +++ b/mace/dsp/tool/mace_dsp_run.cc @@ -0,0 +1,109 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+//
+
+/**
+ * Usage:
+ *   mace_dsp_run --model=mobi_mace.pb \
+ *                --input_shape=1,3,224,224 \
+ *                --input_file=input_data \
+ *                --output_file=mace.out
+ */
+#include <sys/time.h>
+#include <fstream>
+#include "mace/dsp/hexagon_control_wrapper.h"
+#include "mace/core/net.h"
+#include "mace/utils/command_line_flags.h"
+
+using namespace std;
+using namespace mace;
+
+void ParseShape(const string &str, vector<index_t> *shape) {
+  string tmp = str;
+  while (!tmp.empty()) {
+    int dim = atoi(tmp.data());
+    shape->push_back(dim);
+    size_t next_offset = tmp.find(",");
+    if (next_offset == string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  string model_file;
+  string input_shape;
+  string input_file;
+  string output_file;
+  int round = 1;
+
+  std::vector<Flag> flag_list = {
+      Flag("model", &model_file, "model file name"),
+      Flag("input_shape", &input_shape, "input shape, separated by comma"),
+      Flag("input_file", &input_file, "input file name"),
+      Flag("output_file", &output_file, "output file name"),
+      Flag("round", &round, "round"),
+  };
+
+  string usage = Flags::Usage(argv[0], flag_list);
+  const bool parse_result = Flags::Parse(&argc, argv, flag_list);
+
+  if (!parse_result) {
+    LOG(ERROR) << usage;
+    return -1;
+  }
+
+  VLOG(0) << "model: " << model_file << std::endl
+          << "input_shape: " << input_shape << std::endl
+          << "input_file: " << input_file << std::endl
+          << "output_file: " << output_file << std::endl
+          << "round: " << round << std::endl;
+
+  vector<index_t> shape;
+  ParseShape(input_shape, &shape);
+
+  // load input
+  Tensor input_tensor;
+  input_tensor.Resize(shape);
+  float *input_data = input_tensor.mutable_data<float>();
+  ifstream in_file(input_file, ios::in | ios::binary);
+  in_file.read(reinterpret_cast<char *>(input_data),
+               input_tensor.size() * sizeof(float));
+  in_file.close();
+
+  // execute
+  HexagonControlWrapper wrapper;
+  VLOG(0) << "version: " << wrapper.GetVersion();
+  wrapper.Init();
+  wrapper.SetDebugLevel(0);
+  wrapper.Config();
+  VLOG(0) << wrapper.SetupGraph(model_file);
+  wrapper.PrintGraph();
+
+  Tensor output_tensor;
+  timeval tv1, tv2;
+  gettimeofday(&tv1, NULL);
+  for (int i = 0; i < round; ++i) {
+    VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
+  }
+  gettimeofday(&tv2, NULL);
+  cout << "avg duration: "
+       << ((tv2.tv_sec - tv1.tv_sec) * 1000 +
+           (tv2.tv_usec - tv1.tv_usec) / 1000) /
+              round
+       << endl;
+
+  wrapper.GetPerfInfo();
+  wrapper.PrintLog();
+  VLOG(0) << wrapper.TeardownGraph();
+  wrapper.Finalize();
+
+  // save output
+  ofstream out_file(output_file, ios::binary);
+  out_file.write((const char *) (output_tensor.data<float>()),
+                 output_tensor.size() * sizeof(float));
+  out_file.flush();
+  out_file.close();
+}
\ No newline at end of file
diff --git a/mace/dsp/util/BUILD b/mace/dsp/util/BUILD
index 4a75e104fccca2214cd0ffbf014a8c224614d9f4..e5730b285116454ca7c15d5dd08110d3da7c3f42 100644
--- a/mace/dsp/util/BUILD
+++ b/mace/dsp/util/BUILD
@@ -20,7 +20,7 @@ cc_library(
     hdrs = glob([
         "*.h",
     ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
     deps = [
         "//mace/core:core",
     ],
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index b47ef7e73f83a780fd4baf5aa729e980732da7ed..6195f324da7731cf2a7374ded017e734ce92faf8 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -10,15 +10,23 @@
 namespace mace {
 namespace kernels {
 
-template <DeviceType D, typename T>
-struct AddNFunctor {
-  void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+struct AddNFunctorBase {};
+
+template <DeviceType D, typename T>
+struct AddNFunctor : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor) {
+    output_tensor->ResizeLike(input_tensors[0]);
     Tensor::MappingGuard output_map(output_tensor);
     index_t size = input_tensors[0]->size();
     T *output_ptr = output_tensor->mutable_data<T>();
     memset(output_ptr, 0, size * sizeof(T));
     int n = input_tensors.size();
     for (int i = 0; i < n; ++i) {
+      MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
+      MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
+      MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
+      MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
       Tensor::MappingGuard input_map(input_tensors[i]);
       const T *input_ptr = input_tensors[i]->data<T>();
       for (index_t j = 0; j < size; ++j) {
@@ -28,15 +36,17 @@ struct AddNFunctor {
   }
 };
 
-template<>
+template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
+    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
 
-template<>
-void AddNFunctor<DeviceType::OPENCL, float>::operator()(
-    std::vector<const Tensor *> &inputs, Tensor *output);
+template <typename T>
+struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
+  void operator()(const std::vector<const Tensor *> &input_tensors,
+                  Tensor *output_tensor);
+};
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_ADDN_H_
\ No newline at end of file
+#endif  // MACE_KERNELS_ADDN_H_
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index b95d4895bc3963493ef55eb31e776aa4ca732dc0..36b2925742ce6214d3d4d41146221750f47a35b2 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -28,9 +28,10 @@ struct BatchNormFunctor {
     // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
-    const index_t n = input->dim(0);
-    const index_t channel = input->dim(1);
-    const index_t sample_size = input->dim(2) * input->dim(3);
+    const index_t batch = input->dim(0);
+    const index_t height = input->dim(1);
+    const index_t width = input->dim(2);
+    const index_t channels = input->dim(3);
 
     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard scale_mapper(scale);
@@ -48,19 +49,26 @@ struct BatchNormFunctor {
     const T *epsilon_ptr = epsilon->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
+    vector<T> new_scale(channels);
+    vector<T> new_offset(channels);
+
 #pragma omp parallel for
-    for (index_t c = 0; c < channel; ++c) {
-      T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
-      T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale;
-      index_t pos = c * sample_size;
+    for (index_t c = 0; c < channels; ++c) {
+      new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
+      new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
+    }
 
-      for (index_t i = 0; i < n; ++i) {
-        const T *input_sample_ptr = input_ptr + pos;
-        T *output_sample_ptr = output_ptr + pos;
-        for (index_t j = 0; j < sample_size; ++j) {
-          output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
+#pragma omp parallel for
+    for (index_t n = 0; n < batch; ++n) {
+      // compute the offset per batch so iterations stay independent under OpenMP
+      index_t pos = n * height * width * channels;
+      for (index_t h = 0; h < height; ++h) {
+        for (index_t w = 0; w < width; ++w) {
+          for (index_t c = 0; c < channels; ++c) {
+            output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
+            ++pos;
+          }
         }
-        pos += channel * sample_size;
       }
     }
   }
@@ -76,15 +84,16 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *epsilon,
     Tensor *output);
 
-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *scale,
-    const Tensor *offset,
-    const Tensor *mean,
-    const
Tensor *var, - const Tensor *epsilon, - Tensor *output); +template +struct BatchNormFunctor { + void operator()(const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const Tensor *epsilon, + Tensor *output); +}; } // namepsace kernels } // namespace mace diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index a717c6a48513eb075ae4b36124213a109a7f4786..e9a41cfcafef011da308a4df81b3dbc79874bfb2 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -11,13 +11,23 @@ namespace mace { namespace kernels { +struct Conv2dFunctorBase { + Conv2dFunctorBase(const int *strides, + const Padding &paddings, + const int *dilations) + : strides_(strides), dilations_(dilations), paddings_(paddings) {} + + const int *strides_; // [stride_h, stride_w] + const int *dilations_; // [dilation_h, dilation_w] + Padding paddings_; +}; + template -struct Conv2dFunctor { - Conv2dFunctor() {} +struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(const int *strides, const Padding &paddings, const int *dilations) - : strides_(strides), dilations_(dilations), paddings_(paddings) {} + : Conv2dFunctorBase(strides, paddings, dilations) {} void operator()(const Tensor *input, const Tensor *filter, @@ -76,9 +86,10 @@ struct Conv2dFunctor { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { for (int c = 0; c < channels; ++c) { - T bias_channel = bias_data ? bias_data[c] : 0; + T bias_channel = 0.0f; + if (bias) bias_channel = bias_data[c]; *output_data = bias_channel; - T sum = 0; + T sum = 0.0f; const T *filter_ptr = filter_data + c; for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { @@ -113,9 +124,6 @@ struct Conv2dFunctor { } - const int *strides_; // [stride_h, stride_w] - const int *dilations_; // [dilation_h, dilation_w] - Padding paddings_; }; template<> @@ -123,11 +131,19 @@ void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output); -template<> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output); + +template +struct Conv2dFunctor : Conv2dFunctorBase { + Conv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : Conv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output); +}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/fused_conv_2d.h b/mace/kernels/fused_conv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..4daf28e63599497ea5af99ae7ef1a452dd838465 --- /dev/null +++ b/mace/kernels/fused_conv_2d.h @@ -0,0 +1,71 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#ifndef MACE_KERNELS_FUSED_CONV_2D_H_ +#define MACE_KERNELS_FUSED_CONV_2D_H_ + +#include "mace/core/tensor.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/conv_2d.h" + +namespace mace { +namespace kernels { + +struct FusedConv2dFunctorBase { + FusedConv2dFunctorBase(const int *strides, + const Padding &paddings, + const int *dilations) + : strides_(strides), dilations_(dilations), paddings_(paddings) {} + + const int *strides_; // [stride_h, stride_w] + const int *dilations_; // [dilation_h, dilation_w] + Padding paddings_; +}; + +template +struct FusedConv2dFunctor : FusedConv2dFunctorBase { + FusedConv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : FusedConv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + Conv2dFunctor(strides_, paddings_, dilations_)(input, filter, bias, output); + T *output_data = output->mutable_data(); + + T zero_value; + if (DataTypeToEnum::value == DataType::DT_HALF) { + zero_value = half_float::half_cast(0.0f); + } else { + zero_value = 0; + } + auto output_size = output->size(); + for (int n = 0; n < output_size; ++n) { + *output_data = *output_data < 0 ? zero_value : *output_data; + output_data++; + } + } + +}; + +template +struct FusedConv2dFunctor : FusedConv2dFunctorBase { + FusedConv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : FusedConv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_FUSED_CONV_2D_H_ diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc index d7ff94864ea3ba7469cea561558e39b41624db1f..33a2bec5bdfecb985dec1f20d3a0b01f2a245fd2 100644 --- a/mace/kernels/neon/addn_neon.cc +++ b/mace/kernels/neon/addn_neon.cc @@ -10,7 +10,7 @@ namespace kernels { template <> void AddNFunctor::operator()( - std::vector &input_tensors, Tensor *output_tensor) { + const std::vector &input_tensors, Tensor *output_tensor) { // TODO: neon mem copy index_t size = output_tensor->size(); float *output_ptr = output_tensor->mutable_data(); @@ -51,4 +51,4 @@ void AddNFunctor::operator()( }; } // namespace kernels -} // namespace mace \ No newline at end of file +} // namespace mace diff --git a/mace/kernels/neon/pooling_neon.cc b/mace/kernels/neon/pooling_neon.cc index 0f9162349c03398c5301b152a9c87048431caa5f..76868335d12500623cc08fff5d0cfae70761cff9 100644 --- a/mace/kernels/neon/pooling_neon.cc +++ b/mace/kernels/neon/pooling_neon.cc @@ -58,19 +58,27 @@ void PoolingFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor) { + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape(4); + filter_shape[0] = input_tensor->shape()[1]; + filter_shape[1] = input_tensor->shape()[1]; + filter_shape[2] = kernels_[0]; + filter_shape[3] = kernels_[1]; + + kernels::CalcPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), this->dilations_, + strides_, this->padding_, output_shape.data(), + paddings.data()); + output_tensor->Resize(output_shape); + const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - const index_t *output_shape = output_tensor->shape().data(); - int paddings[2]; - std::vector filter_shape = 
{input_shape[1], input_shape[0], - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); #ifdef __COPY_MAKE_PADDING Tensor padded_input; - ConstructInputWithPadding(input_tensor, paddings, &padded_input); + ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input); input = padded_input.data(); input_shape = padded_input.shape().data(); #endif @@ -80,17 +88,17 @@ void PoolingFunctor::operator()( // kernel_size: 2x2, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 && @@ -98,17 +106,17 @@ void PoolingFunctor::operator()( // kernel_size: 3x3, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else { // not implement yet diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 6c5106db25c0f12cb625b6e5e0c80c0497541804..31cd19104f43082e10fa4fdef77e6d02ceeb67cd 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -5,52 +5,83 @@ #include "mace/kernels/addn.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) { - index_t element_size = input0->NumElements(); - index_t blocks = (element_size + 3) / 4; +template +static void AddN(const std::vector &input_tensors, + Tensor *output) { + if (input_tensors.size() > 4) { + MACE_NOT_IMPLEMENTED; + } + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); - const uint32_t gws = blocks; + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_pixels = channel_blocks * width; + const index_t batch_height_pixels = batch * height; auto runtime = OpenCLRuntime::Get(); std::set built_options; - 
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype())); - auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size())); + auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options); const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel); uint32_t idx = 0; - addn_kernel.setArg(idx++, *(static_cast(input0->buffer()))); - addn_kernel.setArg(idx++, *(static_cast(input1->buffer()))); - addn_kernel.setArg(idx++, static_cast(element_size)); - addn_kernel.setArg(idx++, *(static_cast(output->buffer()))); + for (auto input : input_tensors) { + addn_kernel.setArg(idx++, + *(static_cast(input->buffer()))); + } + addn_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( addn_kernel, cl::NullRange, - cl::NDRange(gws), - cl::NDRange(lws), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); + cl::NDRange(width_pixels, batch_height_pixels), + cl::NDRange(64, 16), // TODO fix this + nullptr, OpenCLRuntime::Get()->GetDefaultEvent()); + MACE_CHECK(error == CL_SUCCESS) << "error code: " << error; } -template<> -void AddNFunctor::operator()(std::vector &input_tensors, - Tensor *output_tensor) { - - if (input_tensors.empty() || input_tensors.front() == nullptr) { - return; - } +template +void AddNFunctor::operator()( + const std::vector &input_tensors, Tensor *output_tensor) { size_t size = input_tensors.size(); + MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); + + const index_t batch = input_tensors[0]->dim(0); + const index_t height = input_tensors[0]->dim(1); + const index_t width = input_tensors[0]->dim(2); + const index_t channels = input_tensors[0]->dim(3); - switch (size) { - case 2:Add2(input_tensors[0], input_tensors[1], output_tensor); - break; - default:MACE_NOT_IMPLEMENTED; + for (int i = 1; i < size; ++i) { + MACE_CHECK_NOTNULL(input_tensors[i]); + MACE_CHECK(batch == input_tensors[i]->dim(0)); + MACE_CHECK(height == input_tensors[i]->dim(1)); + MACE_CHECK(width == input_tensors[i]->dim(2)); + MACE_CHECK(channels == input_tensors[i]->dim(3)); } + + std::vector output_shape = input_tensors[0]->shape(); + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output_tensor->ResizeImage(output_shape, output_image_shape); + + AddN(input_tensors, output_tensor); }; +template +struct AddNFunctor; + +template +struct AddNFunctor; + } // namespace kernels -} // namespace mace +} // namespace mace diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index c7cd37e3ec7e6c1e0dbe31cf335bb105869e35c2..c17286895a8868732ada5608d9454cae31cdd746 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -11,8 +11,8 @@ namespace mace { namespace kernels { -template <> -void BatchNormFunctor::operator()( +template +void BatchNormFunctor::operator()( const Tensor *input, const Tensor *scale, const Tensor *offset, @@ -21,35 +21,39 @@ void BatchNormFunctor::operator()( const Tensor *epsilon, Tensor *output) { - index_t pixel_size = input->dim(2) * input->dim(3); - index_t blocks = (pixel_size + 3) / 4; + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const 
index_t width = input->dim(2); + const index_t channels = input->dim(3); - const uint32_t gws[3] = {static_cast(input->dim(0)), - static_cast(input->dim(1)), - static_cast(blocks)}; + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); - const std::vector lws = {1, 1, kwg_size}; + const std::vector lws = {1, kwg_size, 1}; uint32_t idx = 0; - bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(offset->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(offset->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); bm_kernel.setArg(idx++, *(static_cast(epsilon->buffer()))); - bm_kernel.setArg(idx++, static_cast(pixel_size)); - bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); - bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr); - bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr); + bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); auto params_generator = [&kwg_size]()->std::vector> { - return {{1, 1, 64}, + return {{8, 128, 1}, //SNPE size + {1, 1, 64}, {1, 1, 128}, {1, kwg_size/16, 16}, {1, kwg_size/32, 32}, @@ -80,5 +84,9 @@ void BatchNormFunctor::operator()( func); } +template +struct BatchNormFunctor; +template +struct BatchNormFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 511e4598309561a5a453113784db9de4d933399b..f3af3d22622bd5e893347d958da76dbec71a450a 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -24,8 +24,13 @@ void BufferToImageFunctor::operator()(Tensor *buffer, } std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); + if (buffer->dtype() == image->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + } auto runtime = OpenCLRuntime::Get(); string kernel_name; switch (type) { diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index 55c8d0bf5d5ec32053c06eb9724e21156c99e35c..a93099303f8d2e6c6896c61c4a1978be1c222bbf 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,20 +1,33 @@ #include -// Supported data type: half/float 
-__kernel void add2(__global const DATA_TYPE *input0, - __global const DATA_TYPE *input1, - __private const int size, - __global DATA_TYPE *output) { - int idx = get_global_id(0); +__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t input1, +#if INPUT_NUM > 2 + __read_only image2d_t input2, +#endif +#if INPUT_NUM > 3 + __read_only image2d_t input3, +#endif + __write_only image2d_t output) { + const int w = get_global_id(0); + const int hb = get_global_id(1); - if (idx + 4 > size) { - for(; idx < size; ++idx) { - *(output+idx) = *(input0+idx) + *(input1+idx); - } - } else { - VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0); - VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1); - vstore4(in_data0+in_data1, idx, output); - } + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb)); + DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb)); + DATA_TYPE4 out = in0 + in1; + +#if INPUT_NUM > 2 + DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb)); + out = out + in2; +#endif + +#if INPUT_NUM > 3 + DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb)); + out = out + in3; +#endif + + WRITE_IMAGET(output, (int2)(w, hb), out); } diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index e6a52d491972b6efe5ec3ecec3f26792d66b76a6..d0ad2e2aca77a2cc0fb7a51a8a4671060842b077 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,43 +1,28 @@ #include // Supported data types: half/float -void kernel batch_norm(global const DATA_TYPE *input, - global const DATA_TYPE *scale, - global const DATA_TYPE *offset, - global const DATA_TYPE *mean, - global const DATA_TYPE *var, - global const DATA_TYPE *epsilon, - private const int pixels, - global DATA_TYPE *output, - __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale, - __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) { - const int batch = get_global_id(0); - const int channel = get_global_id(1); - const int channels = get_global_size(1); - const int pixel_offset = get_global_id(2); - const int local_channel = get_local_id(1); - const int local_pixel_idx = get_local_id(2); +__kernel void batch_norm(__read_only image2d_t input, + __read_only image2d_t scale, + __read_only image2d_t offset, + __read_only image2d_t mean, + __read_only image2d_t var, + __global const DATA_TYPE *epsilon, + __write_only image2d_t output) { + const int ch_blk = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); + const int width = get_global_size(1); - if(local_pixel_idx == 0) { - new_scale[local_channel] = (float4)(scale[channel] * rsqrt(var[channel] + *epsilon)); - new_offset[local_channel] = (float4)(offset[channel] - mean[channel] * new_scale[local_channel].x); - } + DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0)); - barrier(CLK_LOCAL_MEM_FENCE); + DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon)); + DATA_TYPE4 new_offset = offset_value - mean_value * new_scale; - const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4; - const DATA_TYPE *input_ptr = input + image_offset; - DATA_TYPE *output_ptr 
= output + image_offset; - const int end = (batch * channels + channel + 1) * pixels; - if ((image_offset+4) > end) { - for (int i = image_offset; i < end; ++i) { - *output_ptr = new_scale[local_channel].x * *input_ptr + new_offset[local_channel].x; - ++input_ptr; - ++output_ptr; - } - } else { - VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr); - values = values * new_scale[local_channel] + new_offset[local_channel]; - vstore4(values, 0, output_ptr); - } -} + const int pos = ch_blk * width + w; + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); + DATA_TYPE4 out = in * new_scale + new_offset; + WRITE_IMAGET(output, (int2)(pos, hb), out); +} diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 7c156d8d2ebd44012a3ef0aab04ebcb3b549aee1..499c8164ddc3a0c5158c97e70c6f6ec55f0ccd87 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -14,4 +14,11 @@ #define CMD_TYPE_STR(cmd, type) cmd##type #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type) +#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4) +#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) +#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) + + +__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl new file mode 100644 index 0000000000000000000000000000000000000000..e5ddb3d78fd5d5176123e9b0ab5e4e460e035314 --- /dev/null +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -0,0 +1,148 @@ +#include + +__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int in_ch_blks, + __private const int out_height, + __private const int out_width, + __private const int filter_height, + __private const int filter_width, + __private const int padding_top, + __private const int padding_left) { + const int out_ch_blk = get_global_id(0); + const int out_w_blk = get_global_id(1); + const int out_w_blks = get_global_size(1); + const int out_hb = get_global_id(2); + const int rounded_in_ch = in_ch_blks * 4; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef BIAS + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif + +#if STRIDE == 1 + int in_width0 = out_w_blk - padding_left; + int in_width1 = in_width0 + out_w_blks; + int in_width2 = in_width1 + out_w_blks; + int in_width3 = in_width2 + out_w_blks; + const int height_idx = (out_hb % out_height) - padding_top; +#else + int in_width0 = out_w_blk * 2 - padding_left; + int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; + int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; + int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; + const int height_idx = (out_hb % out_height) * 2 - padding_top; +#endif + + const int batch_idx = (out_hb / out_height) * in_height; + + DATA_TYPE4 in0, in1, in2, in3; + DATA_TYPE4 weights0, weights1, weights2, weights3; + int in_idx, in_width_idx; + // Unrolling this 
loop hurts performance
+  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
+
+      int in_hb_value = height_idx + hb_idx;
+      in_hb_value = select(in_hb_value + batch_idx,
+                           -1,
+                           (in_hb_value < 0 || in_hb_value >= in_height));
+
+      for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
+        in_idx = in_ch_blk * in_width;
+        int in_width_value;
+#define READ_INPUT(i)                                                  \
+        in_width_value = in_width##i + width_idx;                      \
+        in_width_value = select(in_idx + in_width_value,               \
+                                -1,                                    \
+                                (in_width_value < 0 || in_width_value >= in_width)); \
+        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+
+        READ_INPUT(0);
+        READ_INPUT(1);
+        READ_INPUT(2);
+        READ_INPUT(3);
+
+#undef READ_INPUT
+
+        int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
+        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+
+        // Will prefetching into L2 improve performance? How to prefetch image data?
+
+        // Interleaving load and mul does not improve performance as expected
+        out0 += in0.x * weights0;
+        out0 += in0.y * weights1;
+        out0 += in0.z * weights2;
+        out0 += in0.w * weights3;
+
+        out1 += in1.x * weights0;
+        out1 += in1.y * weights1;
+        out1 += in1.z * weights2;
+        out1 += in1.w * weights3;
+
+        out2 += in2.x * weights0;
+        out2 += in2.y * weights1;
+        out2 += in2.z * weights2;
+        out2 += in2.w * weights3;
+
+        out3 += in3.x * weights0;
+        out3 += in3.y * weights1;
+        out3 += in3.z * weights2;
+        out3 += in3.w * weights3;
+
+      }
+    }
+  }
+
+#ifdef FUSED_RELU
+  // TODO relux
+  out0 = fmax(out0, 0);
+  out1 = fmax(out1, 0);
+  out2 = fmax(out2, 0);
+  out3 = fmax(out3, 0);
+#endif
+
+  const int out_x_base = out_ch_blk * out_width;
+  int w = out_w_blk;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out0);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out1);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out2);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w, out_hb),
+               out3);
+
+}
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 56f2cedc5e1f2427fcea57b91b9150e049f618ba..bf3844679006dc5b594a919b2ffa86b324903fc7 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,151 +1,15 @@
 #include <common.h>
 
-#define vec_conv_2d_1x1_s1 \
-  VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
-
-
-#define vec_conv_2d_1x1_s2 \
-  VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
-  VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4); \
-  VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr
+ 3 * in_pixel); \ - VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \ - VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \ - VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \ - VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \ - VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02); - - -#define vec_conv_2d_1x1_compute_loop \ - for (int oc = 0; oc < 4; ++oc) { \ - VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \ - VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \ - out += in0 * weights.x; \ - out += in1 * weights.y; \ - out += in2 * weights.z; \ - out += in3 * weights.w; \ - vstore4(out, 0, output_ptr + oc * out_pixel); \ - } - -#define vec_conv_2d_1x1_compute \ - VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \ - VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \ - out += in0 * weights.x; \ - out += in1 * weights.y; \ - out += in2 * weights.z; \ - out += in3 * weights.w; \ - vstore4(out, 0, output_ptr); - -// Supported data type: half/float -__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */ - __global const DATA_TYPE *filter, /* o, i, kh, kw */ -#ifdef BIAS - __global const DATA_TYPE *bias, /* o */ -#endif /* defined(BIAS) */ - __global DATA_TYPE *output, /* n, c, h, w */ - __private const int in_chan_num, - __private const int out_chan_num, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; - - const int out_chan_begin = out_chan_blk * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - -#ifdef STRIDE_1 - const int stride = 1; -#else - const int stride = 2; -#endif - const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; - - const int in_offset = batch * in_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - - const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; - DATA_TYPE *output_base = output + out_offset + out_pixel_begin; - - int out_chan_len = out_chan_end - out_chan_begin; - int pixel_len = out_pixel_end - out_pixel_begin; - - for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { - DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; -#ifdef BIAS - DATA_TYPE bias_value = bias[out_chan]; -#else - DATA_TYPE bias_value = 0; -#endif - for (int p = 0; p < pixel_len; ++p) { - output_ptr[p] = bias_value; - } - } - - int in_chan = 0; - if (pixel_len == 4) { - for (; in_chan + 3 < in_chan_num; in_chan += 4) { - const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel; - int out_chan = out_chan_begin; - for (; out_chan + 3 < out_chan_end; out_chan += 4) { - const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan; - DATA_TYPE *output_ptr = output_base + 
out_chan * out_pixel; -#ifdef STRIDE_1 - vec_conv_2d_1x1_s1; -#else - vec_conv_2d_1x1_s2; -#endif - vec_conv_2d_1x1_compute_loop; - } - for (; out_chan < out_chan_end; ++out_chan) { - const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan; - DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; -#ifdef STRIDE_1 - vec_conv_2d_1x1_s1; -#else - vec_conv_2d_1x1_s2; -#endif - vec_conv_2d_1x1_compute; - } - } - } - - for (; in_chan < in_chan_num; ++in_chan) { - const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel; - for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { - DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan]; - DATA_TYPE *output_ptr = output_base + out_chan * out_pixel; - - for (int p = 0; p < pixel_len; ++p) { - float in = input_ptr[p*stride]; - output_ptr[p] += in * weights; - } - } - } -} - __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ +#ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif __write_only image2d_t output, + __private const int in_height, + __private const int in_width, __private const int in_ch_blks, + __private const int height, __private const int width) { const int out_ch_blk = get_global_id(0); const int out_w_blk = get_global_id(1); @@ -154,151 +18,103 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0)); - half4 out[4]; - out[0] = (half4)(bias_value.x); - out[1] = (half4)(bias_value.y); - out[2] = (half4)(bias_value.z); - out[3] = (half4)(bias_value.w); - - int w[4]; - w[0] = out_w_blk; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; - - // Unrolling this loop hurt perfmance - int in_x_base = 0; - for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - half4 in[4]; - in[0] = read_imageh(input, sampler, (int2)(in_x_base + w[0], out_hb)); - if (w[1] < width) { - // conditional load hurt perf, this branching helps sometimes - in[1] = read_imageh(input, sampler, (int2)(in_x_base + w[1], out_hb)); - in[2] = read_imageh(input, sampler, (int2)(in_x_base + w[2], out_hb)); - in[3] = read_imageh(input, sampler, (int2)(in_x_base + w[3], out_hb)); - } - - // The order matters, load input first then load filter, why? - const int filter_x0 = in_ch_blk << 2; - half4 weights[4]; - #pragma unroll - for (int c = 0; c < 4; ++c) { - weights[c] = read_imageh(filter, sampler, (int2)(filter_x0 + c, out_ch_blk)); - } - // Will prefetch L2 improve performance? How to pretch image data? 
- - // Interleaving load and mul does not improve performance as expected - #pragma unroll - for (int c = 0; c < 4; ++c) { - out[c] += in[c].x * weights[0]; - out[c] += in[c].y * weights[1]; - out[c] += in[c].z * weights[2]; - out[c] += in[c].w * weights[3]; - } - - in_x_base += width; - } - - const int out_x_base = out_ch_blk * width; - write_imageh(output, (int2)(out_x_base + w[0], out_hb), out[0]); - - if (w[1] >= width) return; - write_imageh(output, (int2)(out_x_base + w[1], out_hb), out[1]); - - if (w[2] >= width) return; - write_imageh(output, (int2)(out_x_base + w[2], out_hb), out[2]); - - if (w[3] >= width) return; - write_imageh(output, (int2)(out_x_base + w[3], out_hb), out[3]); -} - -__kernel void conv_2d_1x1_h8(__read_only image2d_t input, /* [c%8 * w * c/8, h * b] */ - __read_only image2d_t filter, /* cout%8 * cin, cout/8 */ - __read_only image2d_t bias, /* cout%8 * cout/8 */ - __write_only image2d_t output, - __private const int in_ch_blks, - __private const int width) { - const int out_ch_blk = get_global_id(0); - const int out_w_blk = get_global_id(1); - const int out_w_blks = get_global_size(1); - const int out_hb = get_global_id(2); +#ifdef BIAS + DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int4 w; +#if STRIDE == 1 + w.x = out_w_blk; + w.y = w.x + out_w_blks; + w.z = w.y + out_w_blks; + w.w = w.z + out_w_blks; + int out_hb_idx = (out_hb % height); +#else + w.x = out_w_blk * 2; + w.y = (out_w_blk + out_w_blks) * 2; + w.z = (out_w_blk + 2 * out_w_blks) * 2; + w.w = (out_w_blk + 3 * out_w_blks) * 2; + int out_hb_idx = (out_hb % height) * 2; +#endif - float4 bias_value = read_imagef(bias, sampler, (int2)(out_ch_blk, 0)); - half4 bias_value03 = as_half4(bias_value.xy); - half4 bias_value47 = as_half4(bias_value.zw); - half4 out[8]; - out[0] = (half4)(bias_value03.x); - out[1] = (half4)(bias_value03.y); - out[2] = (half4)(bias_value03.z); - out[3] = (half4)(bias_value03.w); - out[4] = (half4)(bias_value47.x); - out[5] = (half4)(bias_value47.y); - out[6] = (half4)(bias_value47.z); - out[7] = (half4)(bias_value47.w); + w.x = select(w.x, INT_MIN, w.x >= in_width); + w.y = select(w.y, INT_MIN, w.y >= in_width); + w.z = select(w.z, INT_MIN, w.z >= in_width); + w.w = select(w.w, INT_MIN, w.w >= in_width); - int w[4]; - w[0] = out_w_blk; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; + out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height, + -1, + out_hb_idx >= in_height); // Unrolling this loop hurts performance int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - half4 in[8]; - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - float4 in_value = read_imagef(input, sampler, (int2)(in_x_base + w[0], out_hb)); - in[wi << 1] = as_half4(in_value.xy); - in[wi << 1 + 1] = as_half4(in_value.zw); - } - // The order matters, load input first then load filter, why? 
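+ // Note: the select() calls above fold out-of-range coordinates to INT_MIN / -1; with CLK_ADDRESS_CLAMP such image reads return the zero border color, so the loads below need no explicit bounds checks.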
+ DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx)); + DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)); + DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)); + DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)); + const int filter_x0 = in_ch_blk << 2; - half4 weights[8]; - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - float4 weights_value = read_imagef(filter, sampler, (int2)(filter_x0 + wi, out_ch_blk)); - weights[wi << 1] = as_half4(weights_value.xy); - weights[wi << 1 + 1] = as_half4(weights_value.zw); - } + DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); + DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)); + DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)); + DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)); // Will prefetching into L2 improve performance? How to prefetch image data? - // Interleaving load and mul does not improve performance as expected - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - int idx = wi << 1; - out[idx] += in[idx].x * weights[0]; - out[idx] += in[idx].y * weights[1]; - out[idx] += in[idx].z * weights[2]; - out[idx] += in[idx].w * weights[3]; + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; - ++idx; - out[idx] += in[idx].x * weights[4]; - out[idx] += in[idx].y * weights[5]; - out[idx] += in[idx].z * weights[6]; - out[idx] += in[idx].w * weights[7]; - } + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; - in_x_base += width; + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + in_x_base += in_width; } +#ifdef FUSED_RELU + // TODO: relux (ReLU capped at a max value) + out0 = fmax(out0, 0); + out1 = fmax(out1, 0); + out2 = fmax(out2, 0); + out3 = fmax(out3, 0); +#endif + const int out_x_base = out_ch_blk * width; - float4 out_value = (float4)(as_float2(out[0]), as_float2(out[1])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); + int out_x_idx = out_w_blk; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); + + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1); - if (w[1] >= width) return; - out_value = (float4)(as_float2(out[2]), as_float2(out[3])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2); - if (w[2] >= width) return; - out_value = (float4)(as_float2(out[4]), as_float2(out[5])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); - if (w[3] >= width) return; - out_value = (float4)(as_float2(out[6]), as_float2(out[7])); - write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value); -} diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 33d7305b6e8ebb77d97071616fa5dfa9eb7c3a5d..08bf04d3c883e12f5970cd82a9394620b2649e51 100644 --- 
a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __write_only image2d_t output, __private const int in_height, __private const int in_width, - __private const int in_channels, + __private const int in_ch_blks, __private const int out_height, __private const int out_width, __private const int padding_top, @@ -17,120 +17,145 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int out_w_blk = get_global_id(1); const int out_w_blks = get_global_size(1); const int out_hb = get_global_id(2); - const int in_ch_blks = (in_channels + 3) / 4; const int rounded_in_ch = in_ch_blks * 4; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - VEC_DATA_TYPE(DATA_TYPE, 4) out[4] = {0}; #ifdef BIAS - out[0] = - CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0)); - out[1] = out[0]; - out[2] = out[0]; - out[3] = out[0]; + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; + DATA_TYPE4 out4 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; + DATA_TYPE4 out4 = 0; +#endif + +#if STRIDE == 1 + int in_width0 = out_w_blk - padding_left; + int in_width1 = in_width0 + out_w_blks; + int in_width2 = in_width1 + out_w_blks; + int in_width3 = in_width2 + out_w_blks; + int in_width4 = in_width3 + out_w_blks; + const int height_idx = (out_hb % out_height) - padding_top; +#else + int in_width0 = out_w_blk * 2 - padding_left; + int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; + int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; + int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; + int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left; + const int height_idx = (out_hb % out_height) * 2 - padding_top; #endif - int w[4]; - w[0] = out_w_blk - padding_left; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; - - const int batch_idx = out_hb / out_height; - const int height_idx = out_hb % out_height; - int in_hb[3]; - in_hb[0] = height_idx - padding_top; - in_hb[1] = in_hb[0] + 1; - in_hb[2] = in_hb[1] + 1; - // Judge the height border for padding input. - in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height; - in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height; - in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height; - - const int input_image_width = in_ch_blks * in_width; + const int batch_idx = (out_hb / out_height) * in_height; + DATA_TYPE4 in0, in1, in2, in3, in4; + DATA_TYPE4 weights0, weights1, weights2, weights3; + int in_idx, hb_idx, width_idx, in_width_idx; // Unrolling this loop hurts performance - int idx = 0; - for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - VEC_DATA_TYPE(DATA_TYPE, 4) in[36]; - VEC_DATA_TYPE(DATA_TYPE, 4) weights[36]; - - int filter_idx = in_ch_blk << 2; - int in_idx = in_ch_blk * in_width; - - #pragma unroll - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 3; ++j) { - idx = i * 12 + j * 4; - int in_width_idx = w[0] + j; - // Judge the width border for padding input. 
- if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 0] = 0; - } else { - in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - in_width_idx = w[1] + j; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 1] = 0; - } else { - in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - in_width_idx = w[2] + j; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 2] = 0; - } else { - in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - in_width_idx = w[3] + j; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[idx + 3] = 0; - } else { - in[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i])); - } - - weights[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); - weights[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); - weights[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); - weights[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); - - filter_idx += rounded_in_ch; - } - } - // Will prefetch L2 improve performance? How to pretch image data? - - // Interleaving load and mul does not improve performance as expected - #pragma unroll - for (int c = 0; c < 4; ++c) { - for (int i = 0; i < 9; ++i) { - out[c] += in[c + i * 4].x * weights[0 + i * 4]; - out[c] += in[c + i * 4].y * weights[1 + i * 4]; - out[c] += in[c + i * 4].z * weights[2 + i * 4]; - out[c] += in[c + i * 4].w * weights[3 + i * 4]; + for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); + for (short width_idx = 0; width_idx < 3; ++width_idx) { + + in_idx = in_ch_blk * in_width; + int in_width_value; +#define READ_INPUT(i) \ + in_width_value = in_width##i + width_idx; \ + in_width_value = select(in_idx + in_width_value, \ + -1, \ + (in_width_value < 0 || in_width_value >= in_width)); \ + in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); + + READ_INPUT(0); + READ_INPUT(1); + READ_INPUT(2); + READ_INPUT(3); + READ_INPUT(4); + +#undef READ_INPUT + + int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; + weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); + weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + + // Will prefetching into L2 improve performance? How to prefetch image data? 
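+ // Note: READ_INPUT folds out-of-bounds (padded) coordinates to -1, so with CLK_ADDRESS_CLAMP those reads return zero and the padded positions contribute nothing to the accumulation below.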
+ + // Interleaving load and mul does not improve performance as expected + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; + + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; + + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + out4 += in4.x * weights0; + out4 += in4.y * weights1; + out4 += in4.z * weights2; + out4 += in4.w * weights3; } } } +#ifdef FUSED_RELU + // TODO: relux (ReLU capped at a max value) + out0 = fmax(out0, 0); + out1 = fmax(out1, 0); + out2 = fmax(out2, 0); + out3 = fmax(out3, 0); + out4 = fmax(out4, 0); +#endif + const int out_x_base = out_ch_blk * out_width; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[0] + padding_left, out_hb), - out[0]); - - w[1] += padding_left; - if (w[1] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[1], out_hb), - out[1]); - - w[2] += padding_left; - if (w[2] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[2], out_hb), - out[2]); - - w[3] += padding_left; - if (w[3] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[3], out_hb), - out[3]); + int w = out_w_blk; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out0); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out1); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out2); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out3); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out4); + } diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index bc987dddb96ec5202fb3882fc279d4480c077f38..bd2763fc8c7eb68c06e09b4822ccc5025807a151 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -1,193 +1,87 @@ #include -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4); - VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4); - VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2); - VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = 
fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr); - DATA_TYPE row01 = *(input_ptr + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width); - DATA_TYPE row11 = *(input_ptr + in_width + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2); - DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2); - - VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2); - - DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2); - return res; -} - -// Supported data type: half/float -__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); - - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; - - const int out_chan_begin = out_chan_blk * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; - DATA_TYPE *output_base = output + out_offset + out_pixel_begin; - - const int pixels = out_pixel_end - out_pixel_begin; - - for (int i = out_chan_begin; i < out_chan_end; ++i) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel; - DATA_TYPE *output_ptr = output_base + i * out_pixel; 
- if (pixels == 4) { - VEC_DATA_TYPE(DATA_TYPE,4) res; -#ifdef STRIDE_1 - res = vec_pooling_3_s1(input_ptr, in_width); +#ifdef FP16 +#define MIN_VALUE -USHRT_MAX #else - res = vec_pooling_3_s2(input_ptr, in_width); +#define MIN_VALUE -FLT_MAX #endif - vstore4(res, 0, output_ptr); - } else { - for (int p = 0; p < pixels; ++p) { - output_ptr[p] = inner_pooling_3(input_ptr, in_width); - input_ptr += stride; - } - } - } -} -int calculate_avg_block_size(const int pos_h, - const int pos_w, - const int pool_size, - const int pad_h, - const int pad_w, - const int h_size, - const int w_size) { - const int h_start = max(0, pos_h - pad_h); - const int w_start = max(0, pos_w - pad_w); - const int h_end = min(pos_h + pool_size - pad_h, h_size); - const int w_end = min(pos_w + pool_size - pad_w, w_size); +inline int calculate_avg_block_size(const int pool_size, + const int pos_h, + const int pos_w, + const int h_size, + const int w_size) { + const int h_start = max(0, pos_h); + const int w_start = max(0, pos_w); + const int h_end = min(pos_h + pool_size, h_size); + const int w_end = min(pos_w + pool_size, w_size); return (h_end - h_start) * (w_end - w_start); } // Supported data type: half/float -__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __private const int pad_h, - __private const int pad_w, - __private const int pooling_size, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_idx = get_global_id(1); - int out_pixel_idx = get_global_id(2); - - const int out_pixel_height = out_pixel_idx / out_width; - const int out_pixel_width = out_pixel_idx % out_width; - - const int out_chan_begin = out_chan_idx * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int in_pixel_idx = out_pixel_height * stride * in_width - + out_pixel_width * stride; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_idx; - DATA_TYPE *output_base = output + out_offset + out_pixel_idx; - - const int block_size = calculate_avg_block_size( - out_pixel_height * stride, - out_pixel_width * stride, - pooling_size, - pad_h/2, - pad_w/2, - in_height - pad_h, - in_width - pad_w); - for (int i = out_chan_begin; i < out_chan_end; ++i) { - VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f; - DATA_TYPE sum1 = 0.0f; - DATA_TYPE *output_ptr = output_base + i * out_pixel; - for (int y = 0; y < pooling_size; ++y) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width; - int x = 0; - for (; x < (pooling_size-8); x += 8) { - VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr); - sum8 += data; - input_ptr += 8; - } - for (; x < pooling_size; ++x) { - sum1 += *input_ptr; - input_ptr++; +__kernel void pooling(__read_only image2d_t input, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int pad_top, + __private const int pad_left, + __private const int stride, + __private const int pooling_size, + __write_only image2d_t output) { + const int out_chan_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_width = get_global_size(1); + const int 
out_hb_idx = get_global_id(2); + + const int batch_idx = (out_hb_idx / out_height) * in_height; + const int in_height_start = (out_hb_idx % out_height) * stride - pad_top; + const int in_width_start = out_width_idx * stride - pad_left; + const int in_channel_offset = out_chan_idx * in_width; + + +#ifdef POOL_AVG + DATA_TYPE4 res = 0; + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = res + in; + } + } + const int block_size = calculate_avg_block_size(pooling_size, + in_height_start, in_width_start, + in_height, in_width); + res /= block_size; +#else + DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE); + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + if (in_height_idx != -1) { + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + if (in_width_idx != -1) { + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = fmax(res, in); + } } } - VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567; - VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23; - - *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size; } +#endif + + WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res); } diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index f34e63cbf07b1a360957fcf5eaf74661ec22b8c1..efb769d27b7ab7836d0681c2b84775047942805a 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,18 +1,19 @@ #include -// Supported data type: half/float -__kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, h, w */ - __global DATA_TYPE *output /* n * c, h, w */, +__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __write_only image2d_t output, __private const float height_scale, __private const float width_scale, __private const int in_height, - __private const int in_width) { - const int c = get_global_id(0); - const int h = get_global_id(1); - const int w = get_global_id(2); - const int channels = get_global_size(0); - const int height = get_global_size(1); - const int width = get_global_size(2); + __private const int in_width, + __private const int out_height) { + const int ch_blk = get_global_id(0); + const int ch_blks = get_global_size(0); + const int w = get_global_id(1); + const int out_width = get_global_size(1); + const int hb = get_global_id(2); + const int b = hb / out_height; + const int h = hb % out_height; const float h_in = h * height_scale; const float w_in = w * width_scale; @@ -24,16 +25,26 @@ __kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, const float h_lerp = h_in - h_lower; const float w_lerp = w_in - w_lower; - const DATA_TYPE *input_base 
= input + c * in_height * in_width; - DATA_TYPE *output_base = output + c * height * width; + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + const int in_w_offset = ch_blk * in_width; + const int in_h_offset = b * in_height; - DATA_TYPE top_left = input_base[h_lower * in_width + w_lower]; - DATA_TYPE top_right = input_base[h_lower * in_width + w_upper]; - DATA_TYPE bottom_left = input_base[h_upper * in_width + w_lower]; - DATA_TYPE bottom_right = input_base[h_upper * in_width + w_upper]; + DATA_TYPE4 top_left = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_lower, in_h_offset + h_lower)); + DATA_TYPE4 top_right = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_upper, in_h_offset + h_lower)); + DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_lower, in_h_offset + h_upper)); + DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler, + (int2)(in_w_offset + w_upper, in_h_offset + h_upper)); - const DATA_TYPE top = top_left + (top_right - top_left) * w_lerp; - const DATA_TYPE bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; - output_base[h * width + w] = top + (bottom - top) * h_lerp; + DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp; + DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; + + DATA_TYPE4 out = top + (bottom - top) * h_lerp; + + const int out_w_offset = ch_blk * out_width; + const int out_h_offset = b * out_height; + WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out); } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 528928e618abf37a0220ed1d9ebf6a5a7c602564..c40481543796215c80f4367e8e5f01a59b32c3be 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -9,50 +9,56 @@ namespace mace { namespace kernels { extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); -template <> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output) { +extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output); + +template +void Conv2dFunctor::operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, Tensor *output); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5][2] = { 
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, {nullptr, nullptr}, - {Conv2dOpenclK3x3S1, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, {nullptr, nullptr}, {nullptr, nullptr}}; index_t kernel_h = filter->dim(0); index_t kernel_w = filter->dim(1); - if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || - strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || - selector[kernel_h - 1][strides_[0] - 1] == nullptr) { + if (!input->is_image() || strides_[0] != strides_[1] || + strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) { LOG(WARNING) << "OpenCL conv2d kernel with " << "filter" << kernel_h << "x" << kernel_w << "," << " stride " << strides_[0] << "x" << strides_[1] << " is not implemented yet, using slow version"; - // TODO(heliangliang) The CPU/NEON kernel should map the buffer - Conv2dFunctor(strides_, paddings_, dilations_)( - input, filter, bias, output); - return; + MACE_NOT_IMPLEMENTED; } std::vector output_shape(4); @@ -61,17 +67,24 @@ void Conv2dFunctor::operator()(const Tensor *input, input->shape().data(), filter->shape().data(), dilations_, strides_, paddings_, output_shape.data(), paddings.data()); - if (input->is_image()) { - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); - output->ResizeImage(output_shape, output_image_shape); + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + + if (kernel_h == kernel_w && kernel_h <= 5 && + selector[kernel_h - 1][strides_[0] - 1] != nullptr) { + auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; + conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum::value, output); } else { - output->Resize(output_shape); + Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum::value, output); } - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, paddings.data(), output); } +template +struct Conv2dFunctor; +template +struct Conv2dFunctor; + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 28f57f484a8e1b29acfefa6f021281f2030cab31..d759689c6dc1ee8ffbfa98f2a4a58577a50c4271 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -5,83 +5,44 @@ #include "mace/kernels/conv_2d.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/utils/utils.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -void Conv1x1V2(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - Tensor *output) { +void Conv1x1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool fused_relu, + const int stride, + const DataType dt, + Tensor *output) { const index_t batch = output->dim(0); - const index_t channels = output->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t input_channels = input->dim(1); - - auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); - const index_t channel_blocks = (channels + 3) / 4; - const index_t pixel_blocks = (width + 3) / 4 * height; - - // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy - // TODO check wired 
clReleaseCommandQueue latency - // The KernelFunctor can cause segment faults in cb_retain_event - std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options); - - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - uint32_t idx = 0; - conv_2d_kernel.setArg(idx++, - *(static_cast(input->buffer()))); - conv_2d_kernel.setArg(idx++, - *(static_cast(filter->buffer()))); - if (bias != nullptr) { - conv_2d_kernel.setArg(idx++, - *(static_cast(bias->buffer()))); - } - conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); - conv_2d_kernel.setArg(idx++, static_cast(input_channels)); - conv_2d_kernel.setArg(idx++, static_cast(channels)); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(3))); - conv_2d_kernel.setArg(idx++, static_cast(height)); - conv_2d_kernel.setArg(idx++, static_cast(width)); - - auto command_queue = runtime->command_queue(); - cl_int error = command_queue.enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(static_cast(batch), static_cast(channel_blocks), - static_cast(pixel_blocks)), - cl::NDRange(1, 2, kwg_size / 2), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS, error); -} - -void Conv1x1V3(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - Tensor *output) { - const index_t batch = output->dim(0); - const index_t channels = output->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t input_channels = input->dim(1); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + const index_t input_channels = input->dim(3); const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_blocks = RoundUpDiv4(width); const index_t input_channel_blocks = RoundUpDiv4(input_channels); + MACE_CHECK(input_batch == batch); + std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace("-DSTRIDE_1"); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (bias != nullptr) { + built_options.emplace("-DBIAS"); + } + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); @@ -96,47 +57,42 @@ void Conv1x1V3(const Tensor *input, conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); } conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input_height)); + conv_2d_kernel.setArg(idx++, static_cast(input_width)); conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); auto command_queue = runtime->command_queue(); cl_int error; error = command_queue.enqueueNDRangeKernel( conv_2d_kernel, cl::NullRange, - cl::NDRange(static_cast(channel_blocks), static_cast(height), + cl::NDRange(static_cast(channel_blocks), + static_cast(width_blocks), static_cast(height * batch)), - cl::NDRange(4, 15, 8), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + cl::NDRange(4, 15, 8), // TODO auto tuning + nullptr, OpenCLRuntime::Get()->GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS, error); } extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, + const bool fused_relu, const int *padding, + const DataType dt, Tensor *output) { - const index_t batch = output->dim(0); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - - const index_t input_batch = input->dim(0); - const index_t input_height = input->dim(2); - const index_t input_width = input->dim(3); - - MACE_CHECK(input_batch == batch && input_height == height && - input_width == width); - - Conv1x1V2(input, filter, bias, 1, output); + Conv1x1(input, filter, bias, fused_relu, 1, dt, output); }; extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, + const bool fused_relu, const int *padding, + const DataType dt, Tensor *output) { - MACE_CHECK(input->dim(0) == output->dim(0)); - - Conv1x1V2(input, filter, bias, 2, output); + Conv1x1(input, filter, bias, fused_relu, 2, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index b7e11e817922287a9b048ed9299c5d332f3ef0cf..7b7453ad53a4af2921cdea07f9a983b51865d848 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -12,8 +12,9 @@ namespace mace { namespace kernels { static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, - const Tensor *bias, const uint32_t stride, - const int *padding, Tensor *output) { + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -22,18 +23,21 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const index_t channel_blocks = RoundUpDiv4(channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels); - const index_t width_blocks = RoundUpDiv4(width); + const index_t width_blocks = RoundUpDiv(width); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - 
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); uint32_t idx = 0; conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); @@ -44,7 +48,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(3))); + conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); conv_2d_kernel.setArg(idx++, padding[0] / 2); @@ -56,18 +60,29 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, conv_2d_kernel, cl::NullRange, cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)), - cl::NDRange(4, 15, 8), + cl::NDRange(16, 16, 4), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS, error); } -void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 1, padding, output); +void Conv2dOpenclK3x3S1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool fused_relu, + const int *padding, + const DataType dt, + Tensor *output) { + Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output); }; -void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { +void Conv2dOpenclK3x3S2(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool fused_relu, + const int *padding, + const DataType dt, + Tensor *output) { + Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc new file mode 100644 index 0000000000000000000000000000000000000000..e46ecbcaca06e811de44b5a29e08abb1e3418906 --- /dev/null +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -0,0 +1,73 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#include "mace/core/common.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +void Conv2dOpencl(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t input_channel_blocks = RoundUpDiv4(input_channels); + const index_t width_blocks = RoundUpDiv4(width); + + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } + + auto runtime = OpenCLRuntime::Get(); + auto program = runtime->program(); + + auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + + uint32_t idx = 0; + conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); + conv_2d_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + if (bias != nullptr) { + conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + } + conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); + conv_2d_kernel.setArg(idx++, static_cast(width)); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(0))); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(1))); + conv_2d_kernel.setArg(idx++, padding[0] / 2); + conv_2d_kernel.setArg(idx++, padding[1] / 2); + + auto command_queue = runtime->command_queue(); + cl_int error; + error = command_queue.enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks), + static_cast(height * batch)), + cl::NDRange(16, 16, 4), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + MACE_CHECK(error == CL_SUCCESS, error); + +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc index 60ce2a829a78a0a0439dd1e287c61f2dee4b490b..1402131df164cb0d1ba348617b3988e78f71c574 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc @@ -32,7 +32,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options); diff --git a/mace/kernels/opencl/fused_conv_2d_opencl.cc b/mace/kernels/opencl/fused_conv_2d_opencl.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e75cb9d7e369b57eb9caa0125a42f6d8b539c50 --- /dev/null +++ b/mace/kernels/opencl/fused_conv_2d_opencl.cc @@ -0,0 +1,87 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/kernels/fused_conv_2d.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { + +extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + +template +void FusedConv2dFunctor::operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const int *padding, const DataType dt, + Tensor *output); + // Selection matrix: kernel_size x stride_size + static const Conv2dOpenclFunction selector[5][2] = { + {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, + {nullptr, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, + {nullptr, nullptr}, + {nullptr, nullptr}}; + + index_t kernel_h = filter->dim(0); + index_t kernel_w = filter->dim(1); + if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || + strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || + selector[kernel_h - 1][strides_[0] - 1] == nullptr) { + LOG(WARNING) << "OpenCL conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides_[0] << "x" << strides_[1] + << " is not implemented yet, using slow version"; + // TODO(heliangliang) The CPU/NEON kernel should map the buffer + FusedConv2dFunctor(strides_, paddings_, dilations_)( + input, filter, bias, output); + return; + } + + std::vector output_shape(4); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations_, + strides_, paddings_, output_shape.data(), paddings.data()); + + if (input->is_image()) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + } else { + output->Resize(output_shape); + } + + auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; + conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum::value, output); +} + +template +struct FusedConv2dFunctor; +template +struct FusedConv2dFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 05221e55dedde3c7cc17d3f99d2818491d930b87..2c1dc264bd5ac1ddaeeaf47ea54a6e8b9e32e13a 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -54,35 +54,19 
@@ void CalImage2DShape(const std::vector &shape, /* NHWC */ } -std::string DataTypeToCLType(const DataType dt) { +std::string DtToCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: return "float"; case DT_HALF: return "half"; - case DT_UINT8: - return "uchar"; - case DT_INT8: - return "char"; - case DT_DOUBLE: - return "double"; - case DT_INT32: - return "int"; - case DT_UINT32: - return "int"; - case DT_UINT16: - return "ushort"; - case DT_INT16: - return "short"; - case DT_INT64: - return "long"; default: LOG(FATAL) << "Unsupported data type"; return ""; } } -std::string DataTypeToOPENCLCMDDataType(const DataType dt) { +std::string DtToCLCMDDt(const DataType dt) { switch (dt) { case DT_FLOAT: return "f"; @@ -94,5 +78,27 @@ std::string DataTypeToOPENCLCMDDataType(const DataType dt) { } } +std::string DtToUpstreamCLDt(const DataType dt) { + switch (dt) { + case DT_FLOAT: + case DT_HALF: + return "float"; + default: + LOG(FATAL) << "Unsupported data type"; + return ""; + } +} + +std::string DtToUpstreamCLCMDDt(const DataType dt) { + switch (dt) { + case DT_FLOAT: + case DT_HALF: + return "f"; + default: + LOG(FATAL) << "Not supported data type for opencl cmd data type"; + return ""; + } +} + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 1ad94aa5d2545f059ec785c0b4ec36a87155fb49..70d74e5886c61a50c0a5fb684d02ecc6e00403cd 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -19,10 +19,13 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, std::vector &image_shape); -std::string DataTypeToOPENCLCMDDataType(const DataType dt); +std::string DtToCLCMDDt(const DataType dt); -std::string DataTypeToCLType(const DataType dt); +std::string DtToUpstreamCLCMDDt(const DataType dt); +std::string DtToCLDt(const DataType dt); + +std::string DtToUpstreamCLDt(const DataType dt); } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 0aaa89ae2c649583dddafaffbcce428d4ffc94fd..349c619574e425aea00b4521194c3ae04649942f 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -10,131 +10,94 @@ namespace mace { namespace kernels { -static void Pooling3(const Tensor *input, - const int *stride, - const PoolingType type, - Tensor *output) { - if (type != MAX) { - MACE_NOT_IMPLEMENTED; - } +static void Pooling(const Tensor *input, + const int *stride, + const int *paddings, + const int pooling_size, + const PoolingType type, + const DataType dt, + Tensor *output) { index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); + index_t out_height = output->dim(1); + index_t out_width = output->dim(2); + index_t channels = output->dim(3); - index_t channel_blk = (channels + 3) / 4; - const index_t pixel_width = (out_width + 3) / 4 ; + index_t channel_blocks = (channels + 3) / 4; const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(pixel_width * out_height), + static_cast(channel_blocks), + static_cast(out_width), + static_cast(batch * out_height), }; auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace(stride[0] == 1 ? 
"-DSTRIDE_1" : ""); - auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); + if (type == MAX && input->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + } + if (type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - const uint32_t lws[3] = {1, 8, 128}; + uint32_t lws[3]; + lws[0] = std::min(channel_blocks, kwg_size); + lws[1] = std::min(out_width, kwg_size / lws[0]); + lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); uint32_t idx = 0; - pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, static_cast(input->dim(1))); pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); + pooling_kernel.setArg(idx++, paddings[0] / 2); + pooling_kernel.setArg(idx++, paddings[1] / 2); pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + pooling_kernel.setArg(idx++, pooling_size); + pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); + MACE_CHECK(error == CL_SUCCESS) << error; } -static void PoolingN(const Tensor *input, - const int *stride, - const int *paddings, - const int pooling_size, - const PoolingType type, - Tensor *output) { - if (type != AVG) { - MACE_NOT_IMPLEMENTED; - } - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); - - index_t channel_blk = (channels + 3) / 4; - const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(out_height * out_width), +template +void PoolingFunctor::operator()(const Tensor *input, + Tensor *output) { + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet"; + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape = { + kernels_[0], kernels_[1], + input->dim(3), input->dim(3) }; - auto runtime = OpenCLRuntime::Get(); - std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_, + output_shape.data(), paddings.data()); - const uint32_t lws[3] = {1, 8, 128}; + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); - uint32_t idx = 0; - pooling_kernel.setArg(idx++, 
*(static_cast(input->buffer()))); - pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); - pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); - pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, paddings[0]); - pooling_kernel.setArg(idx++, paddings[1]); - pooling_kernel.setArg(idx++, pooling_size); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_, + DataTypeToEnum::value, output); - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - pooling_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); -} - -template <> -void PoolingFunctor::operator()(const Tensor *input, - Tensor *output) { - int paddings[2]; - std::vector filter_shape = {input->dim(1), input->dim(0), - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); -#define POOLING_HELPER \ - switch(kernels_[0]) { \ - case 3: \ - Pooling3(input, strides_, pooling_type_, output); \ - break; \ - default: \ - PoolingN(input, strides_, paddings, kernels_[0], \ - pooling_type_, output); \ - break; \ - } - - if (paddings[0] > 0 || paddings[1] > 0) { - Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v()); - ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX); - input = &padded_input; - POOLING_HELPER - } else { - POOLING_HELPER - } -#undef POOLING_HELPER } +template +struct PoolingFunctor; +template +struct PoolingFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 7b77afea0fdd3aed146b22d736cacc5c6c165e79..27dd8e62b96422c368e324d249900b5e8d5f7767 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -6,24 +6,33 @@ #include "mace/core/tensor.h" #include "mace/kernels/resize_bilinear.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -template <> -void ResizeBilinearFunctor::operator()( +template +void ResizeBilinearFunctor::operator()( const Tensor *input, const Tensor *resize_dims, Tensor *output) { const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t in_height = input->dim(2); - const index_t in_width = input->dim(3); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); index_t out_height; index_t out_width; GetOutputSize(resize_dims, &out_height, &out_width); MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape {batch, channels, out_height, out_width}; - output->Resize(out_shape); + std::vector output_shape {batch, out_height, out_width, channels}; + if (input->is_image()) { + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + } else { + output->Resize(output_shape); + } float height_scale = CalculateResizeScale(in_height, out_height, 
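The pooling kernel now derives its local work-group size from the device's limit instead of the hard-coded {1, 8, 128}. A standalone sketch of that clamping logic (illustrative helper, not the MACE API), which keeps lws[0] * lws[1] * lws[2] within kwg_size:

#include <algorithm>
#include <cstdint>
#include <cstdio>

void ClampLws(uint32_t channel_blocks, uint32_t width, uint32_t batch_height,
              uint32_t kwg_size, uint32_t lws[3]) {
  lws[0] = std::min(channel_blocks, kwg_size);
  lws[1] = std::min(width, kwg_size / lws[0]);
  lws[2] = std::min(batch_height, kwg_size / (lws[0] * lws[1]));
}

int main() {
  uint32_t lws[3];
  ClampLws(8, 60, 60, 256, lws);
  std::printf("%u %u %u\n", lws[0], lws[1], lws[2]);  // prints "8 32 1"
}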
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 7b77afea0fdd3aed146b22d736cacc5c6c165e79..27dd8e62b96422c368e324d249900b5e8d5f7767 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -6,24 +6,33 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-template <>
-void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
+template <typename T>
+void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input, const Tensor *resize_dims, Tensor *output) {
   const index_t batch = input->dim(0);
-  const index_t channels = input->dim(1);
-  const index_t in_height = input->dim(2);
-  const index_t in_width = input->dim(3);
+  const index_t in_height = input->dim(1);
+  const index_t in_width = input->dim(2);
+  const index_t channels = input->dim(3);
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
 
   index_t out_height;
   index_t out_width;
   GetOutputSize(resize_dims, &out_height, &out_width);
   MACE_CHECK(out_height > 0 && out_width > 0);
-  std::vector<index_t> out_shape {batch, channels, out_height, out_width};
-  output->Resize(out_shape);
+  std::vector<index_t> output_shape {batch, out_height, out_width, channels};
+  if (input->is_image()) {
+    std::vector<size_t> output_image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+    output->ResizeImage(output_shape, output_image_shape);
+  } else {
+    output->Resize(output_shape);
+  }
 
   float height_scale = CalculateResizeScale(in_height, out_height, align_corners_);
@@ -31,29 +40,37 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+
   uint32_t idx = 0;
-  rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  rb_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+  rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
   rb_kernel.setArg(idx++, height_scale);
   rb_kernel.setArg(idx++, width_scale);
-  rb_kernel.setArg(idx++, static_cast<int>(in_height));
-  rb_kernel.setArg(idx++, static_cast<int>(in_width));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
+  rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
 
   auto command_queue = runtime->command_queue();
 
   cl_int error = command_queue.enqueueNDRangeKernel(
       rb_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(batch * channels),
-                  static_cast<uint32_t>(out_height), static_cast<uint32_t>(out_width)),
-      // TODO (heliangliang) tuning and fix when kwg_size < devisor
-      cl::NDRange(1, 16, kwg_size / 16),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+      cl::NDRange(static_cast<uint32_t>(channel_blocks),
+                  static_cast<uint32_t>(out_width),
+                  static_cast<uint32_t>(out_height * batch)),
+      // TODO tuning
+      cl::NDRange(1, static_cast<uint32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
+      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
 
+template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
+template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
+
 } // namespace kernels
 } // namespace mace
diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opecl.cc
index 2716501c880fcd4fb2232e292b9396e27cfff2f3..72590be5e87ca1c5b721972855b8869e397df82c 100644
--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opecl.cc
@@ -20,7 +20,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_tensor,
                                                                 Tensor *batch_tensor) {
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
   auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
 
   uint32_t idx = 0;
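Both image-backed kernels launch one work item per channel block because the 2-D image layout packs four channels into one pixel, hence RoundUpDiv4 above. A quick standalone sketch, assuming RoundUpDiv4 has the obvious signature from mace/utils/utils.h:

#include <cstdint>
#include <cstdio>

typedef int64_t index_t;

// Assumed to match mace/utils/utils.h: number of 4-wide channel blocks.
inline index_t RoundUpDiv4(index_t v) { return (v + 3) / 4; }

int main() {
  // e.g. channels = 6 -> 2 blocks; image width = channel_blocks * out_width.
  std::printf("channels=6 -> blocks=%lld\n",
              static_cast<long long>(RoundUpDiv4(6)));
}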
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 11c05e47c5eb0faef5b3febe3ee63c4d1864d5c6..0a1960a4e7e891d6c71d1841cf672b3a48a83fdb 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -18,36 +18,66 @@ enum PoolingType {
 
 namespace kernels {
 
-template <DeviceType D, typename T>
-struct PoolingFunctor {
-  PoolingFunctor(const PoolingType pooling_type,
-                 const int *kernels,
-                 const int *strides,
-                 const Padding padding,
-                 const int *dilations)
+struct PoolingFunctorBase {
+  PoolingFunctorBase(const PoolingType pooling_type,
+                     const int *kernels,
+                     const int *strides,
+                     const Padding padding,
+                     const int *dilations)
       : pooling_type_(pooling_type),
         kernels_(kernels),
         strides_(strides),
         padding_(padding),
         dilations_(dilations) {}
 
+  const PoolingType pooling_type_;
+  const int *kernels_;
+  const int *strides_;
+  const Padding padding_;
+  const int *dilations_;
+};
+
+template <DeviceType D, typename T>
+struct PoolingFunctor : PoolingFunctorBase {
+  PoolingFunctor(const PoolingType pooling_type,
+                 const int *kernels,
+                 const int *strides,
+                 const Padding padding,
+                 const int *dilations)
+      : PoolingFunctorBase(pooling_type, kernels,
+                           strides, padding,
+                           dilations) {}
+
   void operator()(const Tensor *input_tensor,
                   Tensor *output_tensor) {
+
+    std::vector<index_t> output_shape(4);
+    std::vector<int> paddings(2);
+    std::vector<index_t> filter_shape = {
+        kernels_[0], kernels_[1],
+        input_tensor->dim(3), input_tensor->dim(3)
+    };
+
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input_tensor->shape().data(), filter_shape.data(),
+        dilations_, strides_, this->padding_,
+        output_shape.data(), paddings.data());
+    output_tensor->Resize(output_shape);
+
     Tensor::MappingGuard in_guard(input_tensor);
     Tensor::MappingGuard out_guard(output_tensor);
     const T *input = input_tensor->data<T>();
     T *output = output_tensor->mutable_data<T>();
     const index_t *input_shape = input_tensor->shape().data();
-    const index_t *output_shape = output_tensor->shape().data();
 
     index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
+    index_t height = output_shape[1];
+    index_t width = output_shape[2];
+    index_t channels = output_shape[3];
     index_t out_image_size = height * width;
 
-    index_t input_channels = input_shape[1];
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
+    index_t input_height = input_shape[1];
+    index_t input_width = input_shape[2];
+    index_t input_channels = input_shape[3];
     index_t in_image_size = input_height * input_width;
 
     int kernel_h = kernels_[0];
@@ -59,11 +89,6 @@ struct PoolingFunctor {
     int dilation_h = dilations_[0];
     int dilation_w = dilations_[1];
 
-    int paddings[2];
-    std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
-                                         kernels_[0], kernels_[1]};
-    kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
-                            strides_, this->padding_, paddings);
     // The left-upper most offset of the padded input
     int padded_h_start = 0 - paddings[0] / 2;
     int padded_w_start = 0 - paddings[1] / 2;
@@ -71,25 +96,24 @@ struct PoolingFunctor {
     if (pooling_type_ == MAX) {
 #pragma omp parallel for collapse(2)
       for (int b = 0; b < batch; ++b) {
-        for (int c = 0; c < channels; ++c) {
-          index_t out_offset = (b * channels + c) * out_image_size;
-          index_t in_offset = (b * input_channels + c) * in_image_size;
-          for (int h = 0; h < height; ++h) {
-            for (int w = 0; w < width; ++w) {
-              T max = std::numeric_limits<T>::lowest();
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            for (int c = 0; c < channels; ++c) {
+              index_t in_offset = b * in_image_size * input_channels + c;
+              T res = std::numeric_limits<T>::lowest();
               for (int kh = 0; kh < kernel_h; ++kh) {
                 for (int kw = 0; kw < kernel_w; ++kw) {
                   int inh = padded_h_start + h * stride_h + dilation_h * kh;
                   int inw = padded_w_start + w * stride_w + dilation_w * kw;
                   if (inh >= 0 && inh < input_height && inw >= 0 &&
                       inw < input_width) {
-                    index_t input_offset = in_offset + inh * input_width + inw;
-                    max = std::max(max, input[input_offset]);
+                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
+                    res = std::max(res, input[input_offset]);
                   }
                 }
               }
-              output[out_offset] = max;
-              out_offset += 1;
+              *output = res;
+              output++;
             }
           }
         }
@@ -97,11 +121,10 @@ struct PoolingFunctor {
     } else if (pooling_type_ == AVG) {
 #pragma omp parallel for collapse(2)
       for (int b = 0; b < batch; ++b) {
-        for (int c = 0; c < channels; ++c) {
-          index_t out_offset = (b * channels + c) * out_image_size;
-          index_t in_offset = (b * input_channels + c) * in_image_size;
-          for (int h = 0; h < height; ++h) {
-            for (int w = 0; w < width; ++w) {
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            for (int c = 0; c < channels; ++c) {
+              index_t in_offset = b * in_image_size * input_channels + c;
               T sum = 0;
               int block_size = 0;
               for (int kh = 0; kh < kernel_h; ++kh) {
@@ -110,14 +133,14 @@ struct PoolingFunctor {
                   int inw = padded_w_start + w * stride_w + dilation_w * kw;
                   if (inh >= 0 && inh < input_height && inw >= 0 &&
                       inw < input_width) {
-                    index_t input_offset = in_offset + inh * input_width + inw;
+                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
                     sum += input[input_offset];
                     block_size += 1;
                   }
                 }
               }
-              output[out_offset] = sum / block_size;
-              out_offset += 1;
+              *output = sum / block_size;
+              output++;
             }
           }
         }
@@ -125,22 +148,26 @@ struct PoolingFunctor {
     }
   }
 
-  const PoolingType pooling_type_;
-  const int *kernels_;
-  const int *strides_;
-  const Padding padding_;
-  const int *dilations_;
 };
 
-template <>
+template<>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *input_tensor,
     Tensor *output_tensor);
 
-template <>
-void PoolingFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input_tensor,
-    Tensor *output_tensor);
+template <typename T>
+struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
+  PoolingFunctor(const PoolingType pooling_type,
+                 const int *kernels,
+                 const int *strides,
+                 const Padding padding,
+                 const int *dilations)
+      : PoolingFunctorBase(pooling_type, kernels,
+                           strides, padding,
+                           dilations) {}
+  void operator()(const Tensor *input_tensor,
+                  Tensor *output_tensor);
+};
 
 } // namespace kernels
 } // namespace mace
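The CPU pooling loops were reordered from NCHW to NHWC; the rewritten inner offsets follow directly from the NHWC flat-index formula. A compilable one-liner capturing it (illustrative helper, not part of the diff):

#include <cassert>
#include <cstdint>

typedef int64_t index_t;

// NHWC flat index: element (n, h, w, c) of a (N, H, W, C) tensor.
inline index_t OffsetNHWC(index_t n, index_t h, index_t w, index_t c,
                          index_t H, index_t W, index_t C) {
  return ((n * H + h) * W + w) * C + c;
}

int main() {
  // Hence the kernels' inner term: (inh * input_width + inw) * input_channels + c.
  assert(OffsetNHWC(0, 2, 1, 3, 4, 5, 8) == (2 * 5 + 1) * 8 + 3);
}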
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 59bb2505c9c379c1b0700d7a515a880a704d72db..27415ebdd8e61ff904360d1c520aab8ecf2b7591 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -61,63 +61,90 @@ void ResizeImage(const T *images,
                  const index_t channels,
                  const std::vector<CachedInterpolation> &xs_vec,
                  const std::vector<CachedInterpolation> &ys,
-                 float *output) {
-  const index_t in_channel_size = in_height * in_width;
-  const index_t in_batch_num_values = channels * in_channel_size;
-  const index_t out_channel_size = out_height * out_width;
-  const index_t out_batch_num_values = channels * out_channel_size;
+                 T *output) {
+  const index_t in_batch_num_values = channels * in_height * in_width;
+  const index_t out_batch_num_values = channels * out_height * out_width;
   const CachedInterpolation *xs = xs_vec.data();
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for
   for (index_t b = 0; b < batch_size; ++b) {
-    for (index_t c = 0; c < channels; ++c) {
-      const T *input_ptr =
-          images + in_batch_num_values * b + in_channel_size * c;
-      float *output_ptr =
-          output + out_batch_num_values * b + out_channel_size * c;
-      for (index_t y = 0; y < out_height; ++y) {
-        const T *ys_input_lower_ptr = input_ptr + ys[y].lower * in_width;
-        const T *ys_input_upper_ptr = input_ptr + ys[y].upper * in_width;
-        const float ys_lerp = ys[y].lerp;
-        for (index_t x = 0; x < out_width; ++x) {
-          auto xs_lower = xs[x].lower;
-          auto xs_upper = xs[x].upper;
-          auto xs_lerp = xs[x].lerp;
-
-          const float top_left = ys_input_lower_ptr[xs_lower];
-          const float top_right = ys_input_lower_ptr[xs_upper];
-          const float bottom_left = ys_input_upper_ptr[xs_lower];
-          const float bottom_right = ys_input_upper_ptr[xs_upper];
-
-          output_ptr[x] = ComputeLerp(top_left, top_right, bottom_left,
-                                      bottom_right, xs_lerp, ys_lerp);
+    const T *batch_input_ptr = images + in_batch_num_values * b;
+    T *batch_output_ptr = output + out_batch_num_values * b;
+
+    for (index_t y = 0; y < out_height; ++y) {
+      const T *y_lower_input_ptr =
+          batch_input_ptr + ys[y].lower * in_width * channels;
+      const T *y_upper_input_ptr =
+          batch_input_ptr + ys[y].upper * in_width * channels;
+      T *y_output_ptr = batch_output_ptr + y * out_width * channels;
+      const float ys_lerp = ys[y].lerp;
+
+      for (index_t x = 0; x < out_width; ++x) {
+        const float xs_lerp = xs[x].lerp;
+        const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
+        const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
+        const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
+        const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
+        T *output_ptr = y_output_ptr + x * channels;
+
+        for (index_t c = 0; c < channels; ++c) {
+          const T top_left = top_left_ptr[c];
+          const T top_right = top_right_ptr[c];
+          const T bottom_left = bottom_left_ptr[c];
+          const T bottom_right = bottom_right_ptr[c];
+
+          output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
+                                      bottom_right, xs_lerp, ys_lerp);
         }
-        output_ptr += out_width;
       }
     }
   }
 }
 
+struct ResizeBilinearFunctorBase {
+  ResizeBilinearFunctorBase(const std::vector<index_t> &size,
+                            bool align_corners)
+      : align_corners_(align_corners), size_(size) {}
+
+ protected:
+  void GetOutputSize(const Tensor *resize_dims,
+                     index_t *out_height,
+                     index_t *out_width) {
+    if (size_[0] < 0 || size_[1] < 0) {
+      MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
+      Tensor::MappingGuard resize_dims_mapper(resize_dims);
+      auto dims_data = resize_dims->data<index_t>();
+      *out_height = dims_data[0];
+      *out_width = dims_data[1];
+    } else {
+      *out_height = size_[0];
+      *out_width = size_[1];
+    }
+  }
+
+  bool align_corners_;
+  std::vector<index_t> size_;
+};
+
 template <DeviceType D, typename T>
-class ResizeBilinearFunctor {
- public:
+struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
   ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-      : align_corners_(align_corners), size_(size) {}
+      : ResizeBilinearFunctorBase(size, align_corners) {}
 
   void operator()(const Tensor *input, const Tensor *resize_dims, Tensor *output) {
     const index_t batch = input->dim(0);
-    const index_t channels = input->dim(1);
-    const index_t in_height = input->dim(2);
-    const index_t in_width = input->dim(3);
+    const index_t in_height = input->dim(1);
+    const index_t in_width = input->dim(2);
+    const index_t channels = input->dim(3);
 
     index_t out_height;
     index_t out_width;
     GetOutputSize(resize_dims, &out_height, &out_width);
     MACE_CHECK(out_height > 0 && out_width > 0);
-    std::vector<index_t> out_shape{batch, channels, out_height, out_width};
+    std::vector<index_t> out_shape{batch, out_height, out_width, channels};
     output->Resize(out_shape);
 
     Tensor::MappingGuard input_mapper(input);
@@ -146,32 +173,18 @@ class ResizeBilinearFunctor {
     ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
                 channels, xs, ys, output_data);
   }
+};
 
- protected:
-  void GetOutputSize(const Tensor *resize_dims,
-                     index_t *out_height,
-                     index_t *out_width) {
-    if (size_[0] < 0 || size_[1] < 0) {
-      MACE_CHECK(resize_dims != nullptr && resize_dims->dim_size() == 1);
-      Tensor::MappingGuard resize_dims_mapper(resize_dims);
-      auto dims_data = resize_dims->data<index_t>();
-      *out_height = dims_data[0];
-      *out_width = dims_data[1];
-    } else {
-      *out_height = size_[0];
-      *out_width = size_[1];
-    }
-  }
+template <typename T>
+struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
+  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
+      : ResizeBilinearFunctorBase(size, align_corners) {}
 
- private:
-  bool align_corners_;
-  std::vector<index_t> size_;
+  void operator()(const Tensor *input,
+                  const Tensor *resize_dims,
+                  Tensor *output);
 };
 
-template <>
-void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input, const Tensor *resize_dims, Tensor *output);
-
 } // namespace kernels
 } // namespace mace
diff --git a/mace/mace.bzl b/mace/mace.bzl
index f9e7b6afc50d2908eef34292f522a0f3c4946c75..757334a8b8c0d5b104afd19bd9654ddec24b3eeb 100644
--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -22,4 +22,10 @@ def if_android_arm64(a):
   return select({
       "//mace:android_arm64": a,
       "//conditions:default": [],
-  })
\ No newline at end of file
+  })
+
+def if_profiling(a):
+  return select({
+      "//mace:is_profiling": a,
+      "//conditions:default": [],
+  })
diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc
index b4b74b04b84d01ac4f6941c649acabc04f25c0d8..ba0bb38019fbfc6274d09dfa81d9efd8e83ed789 100644
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -6,12 +6,26 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("AddN")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      AddNOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>);
+REGISTER_NEON_OPERATOR(OpKeyBuilder("AddN")
+                           .TypeConstraint<float>("T")
+                           .Build(),
+                       AddNOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         AddNOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         AddNOp<DeviceType::OPENCL, half>);
 
 } // namespace mace
diff --git a/mace/ops/addn.h b/mace/ops/addn.h
index a2ffefbbc54e846317415e653078706a2938f67b..155c6830b6aa14e072e3ba67f68ee6421aa427c1 100644
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -10,7 +10,7 @@
 
 namespace mace {
 
-template <DeviceType D, class T>
+template <DeviceType D, typename T>
 class AddNOp : public Operator<D, T> {
  public:
   AddNOp(const OperatorDef &operator_def, Workspace *ws)
@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
 
   bool Run() override {
     Tensor *output_tensor = this->outputs_[0];
-    output_tensor->ResizeLike(this->inputs_[0]);
     int n = this->inputs_.size();
     vector<const Tensor *> inputs(n, nullptr);
     for (int i = 0; i < n; ++i) {
op_def_builder.Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder op_def_builder("AddN", "AddNBM"); + for (int i = 0; i < inputs; ++i) { + op_def_builder.Input(internal::MakeString("Input", i).c_str()); + } + op_def_builder.Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); } // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); + net.Sync(); } mace::testing::StartTiming(); while (iters--) { net.RunOp(D); + net.Sync(); } } -#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \ - static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \ - const int64_t tot = static_cast(iters) * N * SIZE; \ - mace::testing::ItemsProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - AddNBenchmark(iters, N, SIZE); \ - } \ - BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE) - -#define BM_ADDN(N, SIZE, TYPE) \ - BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \ - BM_ADDN_MACRO(N, SIZE, TYPE, NEON); - -BM_ADDN(10, 1000, float); -BM_ADDN(10, 10000, float); -BM_ADDN(100, 1000, float); -BM_ADDN(100, 10000, float); -} // namespace mace \ No newline at end of file +#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \ + static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W * C; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + AddNBenchmark(iters, INPUTS, N, H, W, C); \ + } \ + BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + +#define BM_ADDN(INPUTS, N, H, W, C, TYPE) \ + BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \ + BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL); + +BM_ADDN(2, 1, 240, 240, 256, float); +// BM_ADDN(2, 1, 240, 240, 256, half); +BM_ADDN(4, 1, 240, 240, 256, float); +// BM_ADDN(4, 1, 240, 240, 256, half); + +} // namespace mace diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 3fc58011f623ebf5ff541c1ed2f48d2b9eb5a959..5f9bd2bfe7cce685eca883e6c2159312ca0dd41f 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -9,7 +9,7 @@ namespace mace { class AddnOpTest : public OpsTestBase {}; -template +template void SimpleAdd2() { // Construct graph OpsTestNet net; @@ -20,30 +20,26 @@ void SimpleAdd2() { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); - net.AddInputFromArray("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); // Run net.RunOp(D); - auto expected = CreateTensor({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12}); + auto expected = CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } -TEST_F(AddnOpTest, CPUSimpleAdd2) { - SimpleAdd2(); -} +TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2(); } -TEST_F(AddnOpTest, NEONSimpleAdd2) { - SimpleAdd2(); -} +/* +TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2(); } -TEST_F(AddnOpTest, OPENCLSimpleAdd2) { - SimpleAdd2(); -} +TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2(); } +*/ -template +template void SimpleAdd3() { // Construct graph OpsTestNet net; @@ -55,62 +51,80 @@ void SimpleAdd3() { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); - net.AddInputFromArray("Input2", {1, 1, 2, 3}, {1, 2, 
3, 4, 5, 6}); - net.AddInputFromArray("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); + net.AddInputFromArray("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}); // Run net.RunOp(D); - auto expected = CreateTensor({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18}); + auto expected = CreateTensor({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } -TEST_F(AddnOpTest, CPUSimpleAdd3) { - SimpleAdd3(); -} +TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3(); } -TEST_F(AddnOpTest, NEONSimpleAdd3) { - SimpleAdd3(); -} +/* +TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3(); } +*/ -template +template void RandomTest() { - // Construct graph - OpsTestNet net; - OpDefBuilder("AddN", "AddNTest") - .Input("Input1") - .Input("Input2") - .Output("Output") - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddRandomInput("Input1", {1, 2, 3, 4}); - net.AddRandomInput("Input2", {1, 2, 3, 4}); - - // Check - net.RunOp(D); - - Tensor result; - result.Copy(*net.GetOutput("Output")); - - // Run - net.RunOp(); - - ExpectTensorNear(*net.GetOutput("Output"), result, 1e-5); -} - -TEST_F(AddnOpTest, CPURandom) { - RandomTest(); + testing::internal::LogToStderr(); + srand(time(NULL)); + + for (int round = 0; round < 10; ++round) { + // generate random input + index_t n = 1 + (rand() % 5); + index_t h = 1 + (rand() % 100); + index_t w = 1 + (rand() % 100); + index_t c = 1 + (rand() % 32); + int input_num = 2 + rand() % 3; + // Construct graph + OpsTestNet net; + auto op_def = OpDefBuilder("AddN", "AddNTest"); + for (int i = 0; i < input_num; ++i) { + op_def.Input("Input" + ToString(i)); + } + op_def.Output("Output").Finalize(net.NewOperatorDef()); + + // Add input data + for (int i = 0; i < input_num; ++i) { + net.AddRandomInput("Input" + ToString(i), {n, h, w, c}); + } + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + for (int i = 0; i < input_num; ++i) { + BufferToImage(net, "Input" + ToString(i), + "InputImage" + ToString(i), + kernels::BufferType::IN_OUT); + } + + auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); + for (int i = 0; i < input_num; ++i) { + op_def_cl.Input("InputImage" + ToString(i)); + } + op_def_cl.Output("OutputImage") + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", + kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.1); + } } -TEST_F(AddnOpTest, NEONRandom) { - RandomTest(); -} - -TEST_F(AddnOpTest, OPENCLRandom) { - RandomTest(); -} +TEST_F(AddnOpTest, OPENCLRandom) { RandomTest(); } } // namespace mace diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 1ce9b1e090bbf171bbe3ff33c07512af12e94c80..76723b2dc2c369257b79fb66b8c472752253700d 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -6,12 +6,26 @@ namespace mace { -REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("BatchNorm") + .TypeConstraint("T") + .Build(), + BatchNormOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp); +REGISTER_NEON_OPERATOR(OpKeyBuilder("BatchNorm") + .TypeConstraint("T") + .Build(), + BatchNormOp); #endif // __ARM_NEON -REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm") + 
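The rewritten random test cross-checks the half OpenCL kernel against a float CPU reference, loosening the tolerance to 0.1 to absorb fp16 rounding. A condensed, standalone sketch of that reference-check pattern (stand-in data; the real test compares live tensors):

#include <cassert>
#include <cmath>
#include <vector>

bool TensorNear(const std::vector<float> &expected,
                const std::vector<float> &actual, float abs_err) {
  if (expected.size() != actual.size()) return false;
  for (size_t i = 0; i < expected.size(); ++i) {
    if (std::fabs(expected[i] - actual[i]) > abs_err) return false;
  }
  return true;
}

int main() {
  std::vector<float> reference = {1.0f, 2.0f, 3.0f};    // CPU float result
  std::vector<float> candidate = {1.02f, 1.99f, 3.05f}; // fp16 device result
  assert(TensorNear(reference, candidate, 0.1f));       // loose fp16 tolerance
}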
.TypeConstraint("T") + .Build(), + BatchNormOp); -} // namespace mace \ No newline at end of file +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchNorm") + .TypeConstraint("T") + .Build(), + BatchNormOp); + +} // namespace mace diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index e0d56173d20e89799e7c2f1a9df33a90dbca47bd..4b34de14a0b298dee564bbd1aeab3f1434b2ac4f 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -13,28 +13,45 @@ static void BatchNorm( int iters, int batch, int channels, int height, int width) { mace::testing::StopTiming(); - if ( D == OPENCL ) - OpenCLRuntime::EnableProfiling(); - OpsTestNet net; - OpDefBuilder("BatchNorm", "BatchNormBM") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .Input("Epsilon") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); net.AddRandomInput("Var", {channels}, true); net.AddInputFromArray("Epsilon", {}, {1e-3}); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("BatchNorm", "BatchNormBM") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + else { + OpDefBuilder("BatchNorm", "BatchNormBM") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + + // tuning setenv("MACE_TUNING", "1", 1); net.RunOp(D); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index e13df29c33aad74ea730d39696e9cfa66a3f0aac..73e386caab16bbaff893fb56553a5ba3c4d5bae0 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -11,20 +11,10 @@ class BatchNormOpTest : public OpsTestBase {}; template void Simple() { - // Construct graph OpsTestNet net; - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .Input("Epsilon") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input", {1, 1, 6, 2}, + net.AddInputFromArray("Input", {1, 6, 2, 1}, {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); net.AddInputFromArray("Scale", {1}, {4.0f}); net.AddInputFromArray("Offset", {1}, {2.0}); @@ -32,12 +22,44 @@ void Simple() { net.AddInputFromArray("Var", {1}, {11.67f}); net.AddInputFromArray("Epsilon", {}, {1e-3}); - // Run - net.RunOp(D); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + 
+ OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } // Check auto expected = - CreateTensor({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, + CreateTensor({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); @@ -47,14 +69,17 @@ TEST_F(BatchNormOpTest, SimpleCPU) { Simple(); } +/* TEST_F(BatchNormOpTest, SimpleNEON) { Simple(); } +*/ TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple(); } +/* TEST_F(BatchNormOpTest, SimpleRandomNeon) { srand(time(NULL)); @@ -136,6 +161,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) { ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); } +*/ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { srand(time(NULL)); @@ -145,6 +171,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { index_t channels = 3 + rand() % 50; index_t height = 64; index_t width = 64; + // Construct graph auto &net = test_net(); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -158,29 +185,48 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); net.AddRandomInput("Var", {channels}, true); net.AddInputFromArray("Epsilon", {}, {1e-3}); - // tuning + // run cpu + net.RunOp(); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // Run on opencl + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + + // Tuning setenv("MACE_TUNING", "1", 1); net.RunOp(DeviceType::OPENCL); unsetenv("MACE_TUNING"); // Run on opencl net.RunOp(DeviceType::OPENCL); + net.Sync(); - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run cpu - net.RunOp(); - - ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { @@ -191,6 +237,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { index_t channels = 3 + rand() % 50; index_t height = 103; index_t width = 113; + // Construct graph auto &net = test_net(); OpDefBuilder("BatchNorm", "BatchNormTest") @@ -204,13 +251,38 @@ 
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); net.AddRandomInput("Var", {channels}, true); net.AddInputFromArray("Epsilon", {}, {1e-3}); + // run cpu + net.RunOp(); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + + // Run on opencl + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + // tuning setenv("MACE_TUNING", "1", 1); net.RunOp(DeviceType::OPENCL); @@ -220,14 +292,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.RunOp(DeviceType::OPENCL); net.Sync(); - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // run cpu - net.RunOp(); - - ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } } diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index fa5db7cd470683d97147ee5baf52fb98f3f4753c..61de748b0fc8b8928eb99f8ecdc7e9dc72bca932 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -6,6 +6,9 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(BatchToSpaceND, BatchToSpaceNDOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BatchToSpaceND") + .TypeConstraint("T") + .Build(), + BatchToSpaceNDOp); } // namespace mace diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_to_image.cc index d7eeade2620852361844e1e84edb96ecc3b4e281..56711794b7fef1546ec67e63d873289bea2ef1cc 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_to_image.cc @@ -6,6 +6,14 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") + .TypeConstraint("T") + .Build(), + BufferToImageOp); + +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") + .TypeConstraint("T") + .Build(), + BufferToImageOp); } // namespace mace diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index ea5fbe21592830bcc31ef303311b15aba3b3a98c..43092084d3f75cacf48ecf9dc9dd3fd3861f557d 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Add input data @@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Run @@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { 
diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc
index ea5fbe21592830bcc31ef303311b15aba3b3a98c..43092084d3f75cacf48ecf9dc9dd3fd3861f557d 100644
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
       .Input("Input")
       .Output("B2IOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
      .Finalize(net.NewOperatorDef());
 
   // Add input data
@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
       .Input("B2IOutput")
       .Output("I2BOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
       .Finalize(net.NewOperatorDef())
 
   // Run
@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
 }
 
+TEST(BufferToImageTest, ArgHalfSmall) {
+  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+}
+
 TEST(BufferToImageTest, ArgMedia) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
 }
@@ -91,3 +97,36 @@ TEST(BufferToImageTest, Filter3x3Meida) {
 TEST(BufferToImageTest, Filter3x3Large) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
 }
+
+template <DeviceType D, typename T>
+void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
+  OpsTestNet net;
+  OpDefBuilder("BufferToImage", "BufferToImageTest")
+      .Input("Input")
+      .Output("B2IOutput")
+      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<D, float>("Input", input_shape);
+
+  // Run
+  net.RunOp(D);
+
+  OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
+      .Input("B2IOutput")
+      .Output("I2BOutput")
+      .AddIntArg("buffer_type", type)
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp(D);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3);
+}
+
+TEST(BufferToImageTest, ArgFloatToHalfSmall) {
+  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+}
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index e76a091c251d01699fe9cc3b9bbdde1791541d82..7d36b1af13034ec0a1d51b451edf3df449f83752 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -6,6 +6,9 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("ChannelShuffle")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      ChannelShuffleOp<DeviceType::CPU, float>);
 
 } // namespace mace
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index ec47971b72babc3c50b2ec78d1a8554f8c7deb38..df040904bff47587143f4580c07516444341a7b6 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -6,6 +6,9 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(Concat, ConcatOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      ConcatOp<DeviceType::CPU, float>);
 
 } // namespace mace
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index b3886b296d6b01e21bcc414475ae0f03534df5b8..617bd2c5600670513f67140979fd3ccee3ed6c98 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -6,12 +6,31 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      Conv2dOp<DeviceType::CPU, float>);
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
+                          .TypeConstraint<half>("T")
+                          .Build(),
+                      Conv2dOp<DeviceType::CPU, half>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>);
+REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
+                           .TypeConstraint<float>("T")
+                           .Build(),
+                       Conv2dOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         Conv2dOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         Conv2dOp<DeviceType::OPENCL, half>);
 
 } // namespace mace
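With Conv2D registered for both float and half, the integer "T" argument on a node is what routes it to one registration or the other at creation time. A hypothetical mini-registry showing the idea (the key strings and map here are illustrative, not the MACE implementation):

#include <cassert>
#include <map>
#include <string>

int main() {
  std::map<std::string, int> registry;  // key -> kernel id (stand-in)
  registry["Conv2D_float"] = 0;
  registry["Conv2D_half"] = 1;
  int dtype_arg = 1;  // e.g. set via AddIntArg("T", static_cast<int>(DT_HALF))
  std::string key = std::string("Conv2D_") + (dtype_arg == 1 ? "half" : "float");
  assert(registry.count(key) == 1);  // resolves to the half kernel
}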
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 24211ca1832921c89828b6ec00f45e33a152b77c..b7f6fc731dc0e092d74c5ef6b7434e61e79635f1 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -33,9 +33,9 @@ static void Conv2d(int iters,
   net.AddRandomInput<D, float>("Bias", {output_channels});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -44,6 +44,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -54,6 +55,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   }
 
@@ -91,39 +93,39 @@ static void Conv2d(int iters,
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
 // ICNet
-BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
-BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
-BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
-BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
-BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
 // Test RGB <-> YUV
-BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
-BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
-
-BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
-BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
-BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
-BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
-BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
-BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
-BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
-BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
+//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
+//
+//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
+//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
+//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
+//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
 
 } // namespace mace
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 6120f403b31af34c5689fdd2664ede5924edd826..711bf3891211451429fc3ad0e80e1f55611a4b70 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -84,23 +84,23 @@ TEST_F(Conv2dOpTest, NEONSimple) {
   TestSimple3x3SAME<DeviceType::NEON>();
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
   OpsTestNet net;
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
 
     net.RunOp(D);
 
     // Transfer output
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
 
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -125,33 +126,34 @@ void TestNHWCSimple3x3VALID() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
   }
 
   auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3SAME() {
   OpsTestNet net;
 
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
 
     // Transfer output
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
 
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -176,6 +179,7 @@ void TestNHWCSimple3x3SAME() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -185,17 +189,17 @@ void TestNHWCSimple3x3SAME() {
       {1, 3, 3, 1},
       {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
 TEST_F(Conv2dOpTest, CPUSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::CPU>();
-  TestNHWCSimple3x3SAME<DeviceType::CPU>();
+  TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
+  TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
 }
 
 TEST_F(Conv2dOpTest, OPENCLSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::OPENCL>();
-  TestNHWCSimple3x3SAME<DeviceType::OPENCL>();
+  TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
+  TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }
 
 template <DeviceType D>
@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
   TestSimple3x3WithoutBias<DeviceType::NEON>();
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
   OpsTestNet net;
 
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
     // Transfer output
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("Input")
@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
 
     // Run
@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
   // Check
   auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
 TEST_F(Conv2dOpTest, CPUWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::CPU>();
+  TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
 }
 
 TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>();
+  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }
 
 template <DeviceType D>
@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
   TestCombined3x3<DeviceType::NEON>();
 }
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 static void TestNHWCCombined3x3() {
   // Construct graph
   OpsTestNet net;
 
   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 5, 5, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 2},
      {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f,
       0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
       1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f,
       0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
-  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
+  net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
        .AddIntsArg("strides", {2, 2})
        .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
 
-    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("Input")
@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
        .AddIntsArg("strides", {2, 2})
        .AddIntArg("padding", Padding::SAME)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -388,27 +396,22 @@ static void TestNHWCCombined3x3() {
       {1, 3, 3, 2},
       {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
        9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
 }
 
-TEST_F(Conv2dOpTest, CPUCombined) {
-  TestNHWCCombined3x3<DeviceType::CPU>();
+TEST_F(Conv2dOpTest, CPUStride2) {
+  TestNHWCCombined3x3<DeviceType::CPU, float>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLStride2) {
+  TestNHWCCombined3x3<DeviceType::OPENCL, float>();
 }
 
 template <DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>(
@@ -425,8 +428,37 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
   // Check
   auto expected = CreateTensor<float>(
@@ -445,11 +477,11 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
 }
 
-//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
-//  TestConv1x1<DeviceType::OPENCL>();
-//}
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
+  TestConv1x1<DeviceType::OPENCL>();
+}
 
-template <DeviceType D>
+template <DeviceType D, typename T>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
@@ -457,11 +489,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     srand(time(NULL));
 
     // generate random input
-    index_t batch = 3 + rand() % 10;
+    index_t batch = 3 + (rand() % 10);
    index_t height = shape[0];
    index_t width = shape[1];
-    index_t input_channels = shape[2] + rand() % 10;
-    index_t output_channels = shape[3] + rand() % 10;
+    index_t input_channels = shape[2] + (rand() % 10);
+    index_t output_channels = shape[3] + (rand() % 10);
     // Construct graph
     OpsTestNet net;
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -472,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
 
     // Add input data
-    net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
-    net.AddRandomInput<D, float>(
+    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, T>(
         "Filter", {kernel_h, kernel_w, input_channels, output_channels});
-    net.AddRandomInput<D, float>("Bias", {output_channels});
+    net.AddRandomInput<D, T>("Bias", {output_channels});
 
     // run on cpu
     net.RunOp();
@@ -487,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     expected.Copy(*net.GetOutput("Output"));
 
     // run on gpu
-    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -499,25 +532,136 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
        .Input("FilterImage")
        .Input("BiasImage")
        .Output("OutputImage")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {1, 1})
+ .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; - for (int kernel_size : {3}) { - for (int stride : {1}) { + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, SAME); } } } TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { - TestComplexConvNxNS12({32, 32, 64, 128}); + TestComplexConvNxNS12({32, 32, 32, 64}); } TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { - TestComplexConvNxNS12({107, 113, 5, 7}); + TestComplexConvNxNS12({107, 113, 5, 7}); +} + +template +static void TestHalfComplexConvNxNS12(const std::vector &input_shape, + const std::vector &filter_shape) { + testing::internal::LogToStderr(); + srand(time(NULL)); + + auto func = [&](int stride_h, int stride_w, Padding padding) { + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = input_shape[0]; + index_t width = input_shape[1]; + index_t kernel_h = filter_shape[0]; + index_t kernel_w = filter_shape[1]; + index_t input_channels = filter_shape[2] + (rand() % 10); + index_t output_channels = filter_shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + std::vector float_input_data; + GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data); + std::vector float_filter_data; + GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + std::vector float_bias_data; + GenerateRandomRealTypeData({output_channels}, float_bias_data); + // Add input data + net.AddInputFromArray("Input", {batch, height, width, input_channels}, float_input_data); + net.AddInputFromArray( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); + }; + + for (int stride : {1, 2}) { + func(stride, stride, VALID); + func(stride, stride, SAME); + } +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { + TestHalfComplexConvNxNS12({32, 32}, + {1, 1, 32, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { + 
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {3, 3, 32, 64});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {15, 1, 256, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {1, 15, 256, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x7S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
+                                                {7, 7, 3, 64});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
+                                                {1, 1, 5, 7});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
+                                                {3, 3, 5, 7});
 }
diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
index 992a6f2aa4584b6a9c5a1378885237fd19af6725..b8cb2e5be759a4838351ceb0405f075a3bbbf364 100644
--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -6,15 +6,21 @@
 namespace mace {
 
-REGISTER_CPU_OPERATOR(DepthwiseConv2d,
+REGISTER_CPU_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
+                          .TypeConstraint<float>("T")
+                          .Build(),
                       DepthwiseConv2dOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(DepthwiseConv2d,
+REGISTER_NEON_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
+                           .TypeConstraint<float>("T")
+                           .Build(),
                        DepthwiseConv2dOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(DepthwiseConv2d,
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("DepthwiseConv2d")
+                             .TypeConstraint<float>("T")
+                             .Build(),
                          DepthwiseConv2dOp<DeviceType::OPENCL, float>);
 
 } // namespace mace
diff --git a/mace/ops/fused_conv_2d.cc b/mace/ops/fused_conv_2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e6b0172f9e04cd2d0a098cd701431506856f7f9
--- /dev/null
+++ b/mace/ops/fused_conv_2d.cc
@@ -0,0 +1,30 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/fused_conv_2d.h"
+
+namespace mace {
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      FusedConv2dOp<DeviceType::CPU, float>);
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
+                          .TypeConstraint<half>("T")
+                          .Build(),
+                      FusedConv2dOp<DeviceType::CPU, half>);
+
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         FusedConv2dOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         FusedConv2dOp<DeviceType::OPENCL, half>);
+
+} // namespace mace
diff --git a/mace/ops/fused_conv_2d.h b/mace/ops/fused_conv_2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6baafeaa27365141168511facafb68cc3573073
--- /dev/null
+++ b/mace/ops/fused_conv_2d.h
@@ -0,0 +1,46 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_FUSED_CONV_2D_H_
+#define MACE_OPS_FUSED_CONV_2D_H_
+
+#include <memory>
+
+#include "mace/core/operator.h"
+#include "mace/kernels/fused_conv_2d.h"
+#include "mace/ops/conv_pool_2d_base.h"
+
+namespace mace {
+
+template <DeviceType D, typename T>
+class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
+ public:
+  FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
+      : ConvPool2dOpBase<D, T>(op_def, ws),
+        functor_(this->strides_.data(), this->padding_,
+                 this->dilations_.data()) {
+  }
+
+  bool Run() override {
+    const Tensor *input = this->Input(INPUT);
+    const Tensor *filter = this->Input(FILTER);
+    const Tensor *bias = this->InputSize() > 2 ?
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + functor_(input, filter, bias, output); + + return true; + } + + private: + kernels::FusedConv2dFunctor functor_; + + protected: + OP_INPUT_TAGS(INPUT, FILTER, BIAS); + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_FUSED_CONV_2D_H_ diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..896fbbc6ae700ce99968414c052c1ae07119c49c --- /dev/null +++ b/mace/ops/fused_conv_2d_test.cc @@ -0,0 +1,410 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/fused_conv_2d.h" +#include "mace/ops/ops_test_util.h" + +using namespace mace; + +class FusedConv2dOpTest : public OpsTestBase {}; + +template +void TestNHWCSimple3x3VALID() { + OpsTestNet net; + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + net.AddInputFromArray("Bias", {1}, {-0.1f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +template +void TestNHWCSimple3x3SAME() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + net.AddInputFromArray("Bias", {1}, {-0.1f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", 
kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {1, 3, 3, 1}, + {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +TEST_F(FusedConv2dOpTest, CPUSimple) { + TestNHWCSimple3x3VALID(); + TestNHWCSimple3x3SAME(); +} + +TEST_F(FusedConv2dOpTest, OPENCLSimple) { + TestNHWCSimple3x3VALID(); + TestNHWCSimple3x3SAME(); +} + +template +void TestNHWCSimple3x3WithoutBias() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + // Check + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +TEST_F(FusedConv2dOpTest, CPUWithoutBias) { + TestNHWCSimple3x3WithoutBias(); +} + +TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) { + TestNHWCSimple3x3WithoutBias(); +} + +template +void TestConv1x1() { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 10, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + net.AddInputFromArray( + "Filter", {1, 1, 5, 2}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + 
.Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + // Check + auto expected = CreateTensor( + {1, 3, 10, 2}, + {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); +} + +TEST_F(FusedConv2dOpTest, CPUConv1x1) { + TestConv1x1(); +} + +TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { + TestConv1x1(); +} + +template +static void TestComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", {batch, height, width, input_channels}); + net.AddRandomInput( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}); + net.AddRandomInput("Bias", {output_channels}); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + func(kernel_size, kernel_size, stride, stride, SAME); + } + } +} + +TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) { + TestComplexConvNxNS12({107, 113, 5, 7}); +} + +template +static void 
TestHalfComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + std::vector float_input_data; + GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data); + std::vector float_filter_data; + GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + std::vector float_bias_data; + GenerateRandomRealTypeData({output_channels}, float_bias_data); + // Add input data + net.AddInputFromArray("Input", {batch, height, width, input_channels}, float_input_data); + net.AddInputFromArray( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + } + } +} + +TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) { + TestHalfComplexConvNxNS12({32, 32, 32, 64}); +} + diff --git a/mace/ops/global_avg_pooling.cc b/mace/ops/global_avg_pooling.cc index d507d76fa63ed34c02761c551142faa6a9886a0d..534378445ca59b05af2d5c7e89b46d198b14c4f4 100644 --- a/mace/ops/global_avg_pooling.cc +++ b/mace/ops/global_avg_pooling.cc @@ -6,11 +6,15 @@ namespace mace { -REGISTER_CPU_OPERATOR(GlobalAvgPooling, +REGISTER_CPU_OPERATOR(OpKeyBuilder("GlobalAvgPooling") + .TypeConstraint("T") + .Build(), GlobalAvgPoolingOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(GlobalAvgPooling, +REGISTER_NEON_OPERATOR(OpKeyBuilder("GlobalAvgPooling") + .TypeConstraint("T") + .Build(), GlobalAvgPoolingOp); #endif // __ARM_NEON diff --git a/mace/ops/image_to_buffer.cc b/mace/ops/image_to_buffer.cc index f41d7475cb9282bae2ff5c23bb3c246738e40774..bcf8b997b2b6da5620bdb340c785e47f37915b37 100644 --- a/mace/ops/image_to_buffer.cc +++ b/mace/ops/image_to_buffer.cc @@ -6,6 +6,14 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer") + .TypeConstraint("T") 
+ .Build(), + ImageToBufferOp); + +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ImageToBuffer") + .TypeConstraint("T") + .Build(), + ImageToBufferOp); } // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 6bdf5db5b8835766304299c679f974a32376bf6c..8d593940cf0c5059d5064a27c7edb3558b9f559b 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -13,6 +13,7 @@ #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { @@ -209,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector &shape, std::vector &res) { std::random_device rd; std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); + std::normal_distribution nd(0, 1); index_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); res.resize(size); - std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + if (DataTypeToEnum::value == DT_HALF) { + std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast(nd(gen)); }); + } else { + std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + } } template @@ -289,39 +294,40 @@ inline void ExpectEqual(const double &a, const double &b) { EXPECT_DOUBLE_EQ(a, b); } -inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), y.dtype()); +inline void AssertSameDims(const Tensor &x, const Tensor &y) { ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " << "y.shape [ " << ShapeToString(y) << "]"; } -template ::value> +template ::value> struct Expector; // Partial specialization for float and double. -template -struct Expector { - static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } +template +struct Expector { + static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); } static void Equal(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { ExpectEqual(a(i), b(i)); } } static void Near(const Tensor &x, const Tensor &y, const double abs_err) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b << " index = " << i; @@ -334,17 +340,18 @@ template void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { static_assert(is_floating_point_type::value, "T is not a floating point type"); - Expector::Near(x, y, abs_err); + Expector::Near(x, y, abs_err); } -template -std::string ToString(const T &input) { - std::stringstream ss; - ss << input; - return ss.str(); +template +void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { + static_assert(is_floating_point_type::value + && is_floating_point_type::value, + "T is not a floating point type"); + Expector::Near(x, y, abs_err); } -template +template void 
BufferToImage(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -353,6 +360,7 @@ void BufferToImage(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run @@ -361,7 +369,7 @@ void BufferToImage(OpsTestNet &net, net.Sync(); } -template +template void ImageToBuffer(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -370,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 1c4f1af2f55c9f8ea5f2455f3bf6d0ad84f36ac7..17031378f7e93ac6924f794ec352d3009181179d 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -6,11 +6,29 @@ namespace mace { -REGISTER_CPU_OPERATOR(Pooling, PoolingOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(Pooling, PoolingOp); +REGISTER_NEON_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); #endif // __ARM_NEON -REGISTER_OPENCL_OPERATOR(Pooling, PoolingOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); + } // namespace mace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index f62992f53ed44abae64383e300b873433e9b0216..bbc653ab75d627a412d5fcdfaf5c67772658f24f 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); - std::vector output_shape(4); - std::vector paddings(2); - std::vector filter_shape(4); - // TODO(chenghui): is it kind of a hack? 
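// NOTE: the shape computation removed here is presumably taken over by
// kernels::PoolingFunctor, which now resizes the output itself. A minimal
// sketch of the size arithmetic being relocated (hypothetical helper, not
// part of this patch; assumes dilation == 1):
//
//   index_t PooledSize(index_t in, index_t k, index_t stride, Padding p) {
//     return p == Padding::VALID ? (in - k) / stride + 1        // floor div
//                                : (in + stride - 1) / stride;  // ceil(in / stride)
//   }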
- filter_shape[0] = input->shape()[1]; - filter_shape[1] = input->shape()[0]; - filter_shape[2] = kernels_[0]; - filter_shape[3] = kernels_[1]; - - kernels::CalcPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), this->dilations_.data(), - this->strides_.data(), this->padding_, output_shape.data(), - paddings.data()); - output->Resize(output_shape); - functor_(input, output); return true; }; diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index bf2b182467426292a2fef53dddd0a0d9d3b09dfc..dcda06b75483e6e0e01cfe16594991d72171d2bf 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Add input data net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); // Run net.RunOp(); // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); + CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, AVG_VALID) { - // Construct graph - auto &net = test_net(); - OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("pooling_type", PoolingType::AVG) - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - - // Run - net.RunOp(); - - // Check - auto expected = CreateTensor( - {1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); -} TEST_F(PoolingOpTest, MAX_SAME) { // Construct graph @@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input", {1, 1, 3, 3}, - {0, 1, 2, 3, 4, 5, 6, 7, 8}); + net.AddInputFromArray("Input", {1, 3, 3, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {4, 5, 7, 8}); + auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 4, 4}, + "Input", {1, 4, 4, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {10, 11, 14, 15}); + auto expected = CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 2, 9}, + "Input", {1, 2, 9, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); // Run - net.RunOp(DeviceType::NEON); + net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 1, 5}, {10, 12, 14, 16, 17}); + auto expected = CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } - -template +template static void SimpleMaxPooling3S2() { // 
Construct graph OpsTestNet net; - OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 1, 3, 9}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + "Input", {1, 3, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); - // Run - net.RunOp(D); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + // Run + OpDefBuilder("Pooling", "PoolingTest") + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + } // Check - auto expected = CreateTensor({1, 1, 1, 4}, {20, 22, 24, 26}); + auto expected = CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) { - SimpleMaxPooling3S2(); -} + TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -template -static void AlignedMaxPooling3S2(Padding padding) { +template +static void MaxPooling3S2(const std::vector &input_shape, + const std::vector strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") @@ -198,22 +185,35 @@ static void AlignedMaxPooling3S2(Padding padding) { .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 64, 64}); - // Run - net.RunOp(D); + net.AddRandomInput("Input", input_shape); + + // run on cpu + net.RunOp(); Tensor expected; expected.Copy(*net.GetOutput("Output")); - // Run on cpu - net.RunOp(); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); } // TODO(chenghui) : there is a bug. 
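// The rewritten MaxPooling3S2 above follows the CPU-as-reference pattern used
// throughout these tests: run once on CPU, snapshot the result, replay the op
// through OpenCL image buffers, and compare within a tolerance. A condensed
// sketch of that flow (D and T as in the helper's template signature):
//
//   net.RunOp();                                   // CPU reference
//   Tensor expected;
//   expected.Copy(*net.GetOutput("Output"));
//   BufferToImage<D, T>(net, "Input", "InputImage",
//                       kernels::BufferType::IN_OUT);
//   net.RunOp(D);                                  // OpenCL run
//   ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
//                       kernels::BufferType::IN_OUT);
//   ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);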
@@ -223,152 +223,158 @@ static void AlignedMaxPooling3S2(Padding padding) { //} TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { - AlignedMaxPooling3S2(Padding::VALID); - AlignedMaxPooling3S2(Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) { + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); } -template -static void UnalignedMaxPooling3S2(Padding padding) { +TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::SAME); +} + +TEST_F(PoolingOpTest, AVG_VALID) { // Construct graph - OpsTestNet net; + auto &net = test_net(); OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", padding) + .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::AVG) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 113, 43, 47}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddInputFromArray( + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - // Run on cpu + // Run net.RunOp(); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); -} - -// TODO(chenghui) : there is a bug. 
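// For reference, the AVG_VALID expectation just below is plain 2x2 window
// averaging in NHWC: channel 0 of the input holds 0..15 and channel 1 holds
// 16..31 in row-major order, so the top-left window averages to
// (0 + 1 + 4 + 5) / 4 = 2.5 for channel 0 and (16 + 17 + 20 + 21) / 4 = 18.5
// for channel 1, matching the first pair {2.5, 18.5} of the expected tensor.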
-//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) { -// UnalignedMaxPooling3S2(); -//} + // Check + auto expected = CreateTensor( + {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); -TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { - UnalignedMaxPooling3S2(Padding::VALID); - UnalignedMaxPooling3S2(Padding::SAME); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -template +template static void SimpleAvgPoolingTest() { // Construct graph OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 2, 8, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 1, 2, 8}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); // Check - auto expected = CreateTensor({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5}); + auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, NEONSimpleAvgPooling) { - SimpleAvgPoolingTest(); -} - TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { SimpleAvgPoolingTest(); } -template -static void AlignedAvgPoolingTest(Padding padding) { +template +static void AvgPoolingTest(const std::vector &shape, + const std::vector &kernels, + const std::vector &strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {4, 4}) - .AddIntsArg("strides", {4, 4}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 15, 15}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddRandomInput("Input", shape); - // Run on cpu + // run on cpu net.RunOp(); + Tensor expected; + expected.Copy(*net.GetOutput("Output")); - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); -} - -TEST_F(PoolingOpTest, NEONAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -template -static void UnAlignedAvgPoolingTest(Padding padding) { - // Construct graph - OpsTestNet net; + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {7, 7}) - .AddIntsArg("strides", {7, 7}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddRandomInput("Input", {3, 128, 31, 
37}); - // Run net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - // Run on cpu - net.RunOp(); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.01); +} - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); +TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); } -TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); +TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) { + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME); } + +TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME); +} + diff --git a/mace/ops/relu.cc b/mace/ops/relu.cc index 3365439398af6d1aded3d1f28304958da097b7ab..f471ae64665f34ed9b109fdf5c3f2c1c79ce7320 100644 --- a/mace/ops/relu.cc +++ b/mace/ops/relu.cc @@ -6,10 +6,16 @@ namespace mace { -REGISTER_CPU_OPERATOR(Relu, ReluOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Relu") + .TypeConstraint("T") + .Build(), + ReluOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(Relu, ReluOp); +REGISTER_NEON_OPERATOR(OpKeyBuilder("Relu") + .TypeConstraint("T") + .Build(), + ReluOp); #endif // __ARM_NEON REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index b8b24ced3b006c88bdd449e923d32c47b79567b7..8eae71819537a99cc08454e1585844f7d77f52e3 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -6,14 +6,26 @@ namespace mace { -REGISTER_CPU_OPERATOR(ResizeBilinear, ResizeBilinearOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); #if __ARM_NEON -REGISTER_NEON_OPERATOR(ResizeBilinear, +REGISTER_NEON_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), ResizeBilinearOp); #endif // __ARM_NEON -REGISTER_OPENCL_OPERATOR(ResizeBilinear, +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), ResizeBilinearOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("ResizeBilinear") + .TypeConstraint("T") + .Build(), + ResizeBilinearOp); + } // namespace mace diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 8429fd6bee0f8617e98268cd4ce97be43935a44c..d9453908c11bff15ad8ee3c996af03523d6fb7d1 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -19,18 +19,30 @@ static void 
ResizeBilinearBenchmark(int iters,
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
-      .Input("Input")
-      .Input("OutSize")
-      .Output("Output")
-      .AddIntsArg("size", {output_height, output_width})
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddRandomInput<D, T>("Input",
-                           {batch, channels, input_height, input_width});
+                           {batch, input_height, input_width, channels});
   net.AddInputFromArray<D, index_t>("OutSize", {2}, {output_height, output_width});
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
+        .Input("InputImage")
+        .Input("OutSize")
+        .Output("OutputImage")
+        .AddIntsArg("size", {output_height, output_width})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
+        .Input("Input")
+        .Input("OutSize")
+        .Output("Output")
+        .AddIntsArg("size", {output_height, output_width})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
@@ -58,9 +70,12 @@ static void ResizeBilinearBenchmark(int iters,
 #define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE)        \
   BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU);  \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, NEON); \
   BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
 
+// SNPE 835 GPU: 6870us
+BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
+BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
+
 BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
 BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float);
 BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float);
diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc
index 7b7cee9d97da3afd98e80ff710815f06cf1d8eef..3e50c3b4c15133238fb2e7b937430dc8d13dffdd 100644
--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -23,14 +23,14 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
 
   // Add input data
   vector<float> input(24);
   std::iota(begin(input), end(input), 0);
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input);
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
   net.AddInputFromArray<DeviceType::CPU, index_t>("OutSize", {2}, {1, 2});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 2, 8, 10, 16, 18});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -49,14 +49,14 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
 
   // Add input data
   vector<float> input(24);
   std::iota(begin(input), end(input), 0);
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 2, 4}, input);
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
   net.AddInputFromArray<DeviceType::CPU, index_t>("OutSize", {2}, {1, 2});
 
   // Run
   net.RunOp();
 
   // Check
-  auto expected = CreateTensor<float>({1, 3, 1, 2}, {0, 3, 8, 11, 16, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -65,6 +65,7 @@ template <DeviceType D>
 void TestRandomResizeBilinear() {
   srand(time(nullptr));
   testing::internal::LogToStderr();
+
   for (int round = 0; round < 10; ++round) {
     int batch = 1 + rand() % 5;
     int channels = 1 + rand() % 100;
@@ -72,39 +73,54 @@
     int height = 1 + rand() % 100;
     int width = 1 + rand() % 100;
     int in_height = 1 + rand() % 100;
     int in_width = 1 + rand() % 100;
+    int align_corners = rand() % 2;
    // Construct
graph OpsTestNet net; - OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") - .Input("Input") - .Input("OutSize") - .Output("Output") - .AddIntArg("align_corners", 1) - .AddIntsArg("size", {height, width}) - .Finalize(net.NewOperatorDef()); - // Add input data net.AddRandomInput("Input", - {batch, channels, in_height, in_width}); + {batch, in_height, in_width, channels}); net.AddInputFromArray("OutSize", {2}, {height, width}); - // Run - net.RunOp(D); - Tensor actual; - actual.Copy(*net.GetOutput("Output")); - + OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") + .Input("Input") + .Input("OutSize") + .Output("Output") + .AddIntArg("align_corners", align_corners) + .AddIntsArg("size", {height, width}) + .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - Tensor *expected = net.GetOutput("Output"); + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + + OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") + .Input("InputImage") + .Input("OutSize") + .Output("OutputImage") + .AddIntArg("align_corners", align_corners) + .AddIntsArg("size", {height, width}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "DeviceOutput", kernels::BufferType::IN_OUT); + } else { + // TODO support NEON + } // Check - ExpectTensorNear(*expected, actual, 0.001); + ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 0.001); } } +/* TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) { TestRandomResizeBilinear(); } +*/ TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) { TestRandomResizeBilinear(); diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index 8a7af417768038f6cb66048a375bb6e5ff8fa402..fec9866872e94aa4aa1dd2f218d0585ebdc776c1 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -6,6 +6,9 @@ namespace mace { -REGISTER_OPENCL_OPERATOR(SpaceToBatchND, SpaceToBatchNDOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("SpaceToBatchND") + .TypeConstraint("T") + .Build(), + SpaceToBatchNDOp); } // namespace mace diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 8e6808219ce27cbe4f122a1e4b909c71f7bab7a3..119e1fed79a7cad1374cdb3891745ec2c83716bb 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -67,12 +67,20 @@ message NodeInput { optional int32 output_port = 2; } +message OutputShape { + repeated int64 dims = 1; +} + message OperatorDef { repeated string input = 1; repeated string output = 2; optional string name = 3; optional string type = 4; repeated Argument arg = 5; + repeated OutputShape output_shape = 6; + + // Memory optimization: only support one single output op + optional int32 mem_id = 10 [default = -1]; // for hexagon mace-nnlib optional uint32 node_id = 100; @@ -82,6 +90,16 @@ message OperatorDef { repeated int32 out_max_byte_size = 104; // only support 32-bit len } +// for memory optimization +message MemoryBlock { + optional int32 mem_id = 1; + optional uint32 x = 2; + optional uint32 y = 3; +} +message MemoryArena { + repeated MemoryBlock mem_block = 1; +} + // for hexagon mace-nnlib message InputInfo { optional string name = 1; diff --git a/mace/python/tools/tf_converter.py b/mace/python/tools/tf_converter.py index fbf19f5b7cf8d705683a959bece07067ac43a5f9..d30a463ca2bce938d716e799f82049308e044586 100644 --- a/mace/python/tools/tf_converter.py +++ b/mace/python/tools/tf_converter.py @@ -21,7 +21,7 @@ def main(unused_args): if 
FLAGS.runtime == 'dsp': output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( - input_graph_def, FLAGS.input_node, FLAGS.output_node) + input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize) else: output_graph_def = tf_converter_lib.convert_to_mace_pb( input_graph_def) @@ -62,6 +62,11 @@ def parse_args(): type=str, default="softmax", help="e.g., softmax") + parser.add_argument( + "--prequantize", + type=bool, + default=False, + help="e.g., False") return parser.parse_known_args() diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 97575bf23ce9583f1db75ce37d5bc699d0f0189e..27df84accf8859a20454f4c512ce688ccea8081a 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -18,15 +18,6 @@ def convert_tensor(op, tensor): tensor.name = op.outputs[0].name shape = list(tf_tensor.shape) - if (op.name.find('pointwise_kernel') != -1 or - op.name.find('depthwise_kernel') != -1 or - op.name.endswith('weights') or - op.name.endswith('kernel')) \ - and op.outputs[0].consumers()[0].type.find('Conv') != -1: - if op.outputs[0].consumers()[0].get_attr('data_format') == 'NHWC': - tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1)) - shape = [shape[3], shape[2], shape[0], shape[1]] - # print (tensor.name, shape) tensor.dims.extend(shape) tf_dt = op.get_attr('dtype') @@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = first_op.type op_def.input.extend([input.name for input in first_op.inputs]) op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) padding_arg = op_def.arg.add() padding_arg.name = 'padding' padding_arg.i = padding_mode[first_op.get_attr('padding')] @@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def): strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NCHW' + data_format_arg.s = 'NHWC' if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd': bias_add_op = unresolved_ops[1] @@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = 'BatchNorm' op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon]) op_def.output.extend([output.name for output in add_1_op.outputs]) + output_shapes = [] + for output in add_1_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) resolved_count = 7 elif first_op.type == 'Relu6': @@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = 'Relu' op_def.input.extend([input.name for input in first_op.inputs]) op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) max_limit_arg = op_def.arg.add() max_limit_arg.name = 'max_limit' max_limit_arg.f = 6 @@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def): op_def.type = 'Pooling' op_def.input.extend([input.name for input in first_op.inputs]) op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + 
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 97575bf23ce9583f1db75ce37d5bc699d0f0189e..27df84accf8859a20454f4c512ce688ccea8081a 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -18,15 +18,6 @@ def convert_tensor(op, tensor):
     tensor.name = op.outputs[0].name
 
     shape = list(tf_tensor.shape)
-    if (op.name.find('pointwise_kernel') != -1 or
-        op.name.find('depthwise_kernel') != -1 or
-        op.name.endswith('weights') or
-        op.name.endswith('kernel')) \
-        and op.outputs[0].consumers()[0].type.find('Conv') != -1:
-      if op.outputs[0].consumers()[0].get_attr('data_format') == 'NHWC':
-        tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
-        shape = [shape[3], shape[2], shape[0], shape[1]]
-    # print (tensor.name, shape)
     tensor.dims.extend(shape)
 
     tf_dt = op.get_attr('dtype')
@@ -66,6 +57,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = first_op.type
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     padding_arg = op_def.arg.add()
     padding_arg.name = 'padding'
     padding_arg.i = padding_mode[first_op.get_attr('padding')]
@@ -74,7 +71,7 @@ def convert_ops(unresolved_ops, net_def):
     strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NCHW'
+    data_format_arg.s = 'NHWC'
 
     if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd':
       bias_add_op = unresolved_ops[1]
@@ -105,6 +102,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = 'BatchNorm'
     op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon])
     op_def.output.extend([output.name for output in add_1_op.outputs])
+    output_shapes = []
+    for output in add_1_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     resolved_count = 7
 
   elif first_op.type == 'Relu6':
@@ -113,6 +116,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = 'Relu'
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     max_limit_arg = op_def.arg.add()
     max_limit_arg.name = 'max_limit'
     max_limit_arg.f = 6
@@ -122,6 +131,12 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = 'Pooling'
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
     pooling_type_arg = op_def.arg.add()
     pooling_type_arg.name = 'pooling_type'
     pooling_type_arg.i = pooling_type_mode[first_op.type]
@@ -136,21 +151,46 @@ def convert_ops(unresolved_ops, net_def):
     kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3])
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NCHW'
+    data_format_arg.s = 'NHWC'
   elif first_op.type == 'Add':
     op_def = net_def.op.add()
     op_def.name = first_op.name
     op_def.type = "AddN"
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
-  elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND', 'BatchToSpaceND']:
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
+  elif first_op.type == 'ConcatV2':
+    op_def = net_def.op.add()
+    op_def.name = first_op.name
+    op_def.type = "Concat"
+    op_def.input.extend([input.name for input in first_op.inputs])
+    op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
+  elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND',
+                         'BatchToSpaceND', 'BiasAdd', 'FusedBatchNorm']:
     op_def = net_def.op.add()
     op_def.name = first_op.name
     op_def.type = first_op.type
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
+    output_shapes = []
+    for output in first_op.outputs:
+      output_shape = mace_pb2.OutputShape()
+      output_shape.dims.extend(output.shape.as_list())
+      output_shapes.append(output_shape)
+    op_def.output_shape.extend(output_shapes)
   else:
-    raise Exception('Unknown Op: ' + first_op.name)
+    raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type))
     pass
 
   for i in range(resolved_count):
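The six-line output_shape block is now repeated verbatim in every branch of convert_ops above (Conv, BatchNorm, Relu6, Pooling, AddN, Concat, and the pass-through list). A small helper, hypothetical and not part of this patch, would keep the branches from drifting apart; it assumes the mace_pb2 module already imported by tf_converter_lib:

    def add_output_shapes(op_def, tf_op):
      # Record the static TF output shapes on the MACE OperatorDef,
      # exactly as each branch of convert_ops does above.
      output_shapes = []
      for output in tf_op.outputs:
        output_shape = mace_pb2.OutputShape()
        output_shape.dims.extend(output.shape.as_list())
        output_shapes.append(output_shape)
      op_def.output_shape.extend(output_shapes)

Each branch would then reduce to add_output_shapes(op_def, first_op), or add_output_shapes(op_def, add_1_op) in the BatchNorm case.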
diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py
index 8f925059279d2b50b13fc28aaf1aca975ec67bc7..ced16ce853e8f49b9c968e09ed257a8e3bf815b5 100644
--- a/mace/python/tools/tf_dsp_converter_lib.py
+++ b/mace/python/tools/tf_dsp_converter_lib.py
@@ -5,7 +5,7 @@ from dsp_ops import DspOps
 from mace.python.tools import graph_util
 
 # converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
-#           --runtime dsp --input_dim input_node,1,480,480,3 --output_node icnet/output_node
+#           --runtime dsp --input_node input_node --output_node output_node
 
 padding_mode = {
   'NA': 0,
@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def):
       for follow_op in follow_ops:
         new_follow_op = mace_pb2.OperatorDef()
         new_follow_op.CopyFrom(follow_op)
-        for i in range(len(follow_op.input)):
-          for k in range(3):
+        for i in xrange(len(follow_op.input)):
+          for k in xrange(3):
             if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
               new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
         new_ops.append(new_follow_op)
@@ -220,9 +220,7 @@
   new_net_def = mace_pb2.NetDef()
   new_net_def.tensors.extend(tensor_map.values())
-  for op in net_def.op:
-    if op.name not in skip_ops:
-      new_net_def.op.extend([op])
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
   new_net_def.op.extend(new_ops)
   return new_net_def
 
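The next hunk rewrites add_input_output_info to take the I/O data type explicitly instead of reading it off the TF tensors. For a prequantized graph (dtype == DT_UINT8), each logical input and output is described by three entries: the quantized data tensor followed by two 1x1x1x1 DT_FLOAT entries, which appears to follow the (data, min, max) tensor triples the DSP ops use elsewhere in this file (note the k in xrange(3) rewiring and the out_max_byte_size list of [size/4, 4, 4] below). A sketch of what a consumer of the resulting NetDef would observe; the helper name is illustrative only:

    def dump_io_info(net_def):
      # For a DT_UINT8 graph, expect per input: the uint8 data tensor's
      # dims, then two [1, 1, 1, 1] float entries for the min/max range.
      for info in net_def.input_info:
        print('input: dims=%s dtype=%d' % (list(info.dims), info.data_type))
      for info in net_def.output_info:
        print('output: dims=%s dtype=%d' % (list(info.dims), info.data_type))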
@@ -249,29 +247,101 @@ def add_node_id(net_def):
   return net_def
 
-def add_input_output_info(net_def, input_node, output_node, graph):
+def add_input_output_info(net_def, input_node, output_node, graph, dtype):
   input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
   output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
-  for op in net_def.op:
-    if op.name == input_node:
+  input_info = net_def.input_info.add()
+  input_info.dims.extend(input_tensor.shape.as_list())
+  input_info.data_type = dtype
+  if dtype == mace_pb2.DT_UINT8:
+    for i in xrange(2):
       input_info = net_def.input_info.add()
-      input_info.name = op.name
-      input_info.node_id = op.node_id
-      input_info.dims.extend(input_tensor.shape.as_list())
-      input_info.max_byte_size = max_elem_size(input_tensor)
-      input_info.data_type = find_dtype(input_tensor.dtype)
-    elif op.name == output_node:
+      input_info.dims.extend([1,1,1,1])
+      input_info.data_type = mace_pb2.DT_FLOAT
+
+  output_info = net_def.output_info.add()
+  output_info.dims.extend(output_tensor.shape.as_list())
+  output_info.data_type = dtype
+  if dtype == mace_pb2.DT_UINT8:
+    for i in xrange(2):
       output_info = net_def.output_info.add()
-      output_info.name = op.name
-      output_info.node_id = op.node_id
-      output_info.dims.extend(output_tensor.shape.as_list())
-      output_info.max_byte_size = max_elem_size(output_tensor)
-      output_info.data_type = find_dtype(output_tensor.dtype)
+      output_info.dims.extend([1,1,1,1])
+      output_info.data_type = mace_pb2.DT_FLOAT
   return net_def
 
-def convert_to_mace_pb(input_graph_def, input_node, output_node):
+def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
+  tensor_map = {}
+  for tensor in net_def.tensors:
+    tensor_map[tensor.name] = tensor
+  op_map = {}
+  for op in net_def.op:
+    op_map[op.name] = op
+  consumers = {}
+  for op in net_def.op:
+    for ipt in op.input:
+      if ipt not in consumers:
+        consumers[ipt] = []
+      consumers[ipt].append(op)
+
+  skip_ops = set()
+  new_ops = []
+  skip_tensors = set()
+
+  # INPUT->Flatten->Minf, Maxf->Quantize
+  for op in net_def.op:
+    if op.type == 'INPUT':
+      input_op = op
+      flatten_op = None
+      quantize_op = None
+      for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
+        if o.type == 'Flatten':
+          flatten_op = o
+        elif o.type == 'Quantize':
+          quantize_op = o
+      if quantize_op is not None:
+        minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
+        skip_ops = skip_ops.union([input_op.name, flatten_op.name, minf_op.name, maxf_op.name, quantize_op.name])
+        skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
+
+        new_input_op = mace_pb2.OperatorDef()
+        new_input_op.name = input_op.name
+        new_input_op.type = input_op.type
+        new_input_op.padding = input_op.padding
+        new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0]/4, 4, 4])
+        new_ops.append(new_input_op)
+        for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
+          new_follow_op = mace_pb2.OperatorDef()
+          new_follow_op.CopyFrom(follow_op)
+          for i in xrange(len(follow_op.input)):
+            for k in xrange(3):
+              if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
+                new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
+          new_ops.append(new_follow_op)
+          skip_ops.add(follow_op.name)
+
+    elif op.type == 'OUTPUT':
+      output_op = op
+      dequantize_op = get_node_from_map(op_map, output_op.input[0])
+      if dequantize_op.type == 'Dequantize':
+        skip_ops = skip_ops.union([dequantize_op.name, output_op.name])
+
+        new_output_op = mace_pb2.OperatorDef()
+        new_output_op.name = output_op.name
+        new_output_op.type = output_op.type
+        new_output_op.input.extend(dequantize_op.input)
+        new_ops.append(new_output_op)
+
+  new_net_def = mace_pb2.NetDef()
+  new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
+  new_net_def.op.extend(new_ops)
+  return new_net_def
+
+def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
   """ nnlib does not have batch norm, so use tensorflow optimizer to fold
   batch norm with convolution. The fold optimization reorders ops, so
@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node):
   add_output_node(net_def, output_node)
 
   # optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
+
+  if prequantize:
+    net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
+
   sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
   net_def_with_node_id = add_node_id(sorted_net_def)
-  final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph)
+  if prequantize:
+    dtype = mace_pb2.DT_UINT8
+  else:
+    dtype = mace_pb2.DT_FLOAT
+  final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
 
   return final_net_def
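With the prequantize path in place, the converter entry point can be driven end to end. A minimal usage sketch, assuming a TF 1.x environment and a frozen, already-quantized graph; the file path and node names are placeholders:

    import tensorflow as tf
    from mace.python.tools import tf_dsp_converter_lib

    with tf.gfile.GFile('quantized_model.pb', 'rb') as f:
      graph_def = tf.GraphDef()
      graph_def.ParseFromString(f.read())

    # prequantize=True strips the INPUT->Flatten->Min/Max->Quantize chain and
    # the trailing Dequantize, so the device exchanges uint8 buffers directly.
    net_def = tf_dsp_converter_lib.convert_to_mace_pb(
        graph_def, 'input_node', 'output_node', prequantize=True)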
diff --git a/mace/python/tools/tf_ops_stats.py b/mace/python/tools/tf_ops_stats.py
index 9301b3f1a5d8537418704ea9b73e50c30460f545..d60487a96434bf1fbda63f0bb456a973e4c07b9b 100644
--- a/mace/python/tools/tf_ops_stats.py
+++ b/mace/python/tools/tf_ops_stats.py
@@ -68,7 +68,7 @@ def main(unused_args):
         if input_name.endswith('weights:0') and input_name in tensor_shapes:
           ksize = tensor_shapes[input_name]
           break
-      print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape.as_list(), op.outputs[0].shape.as_list()))
+      print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape, op.outputs[0].shape))
       key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format)
       hist_inc(stats, key)
     elif op.type in ['FusedResizeAndPadConv2D']:
@@ -92,6 +92,7 @@ def main(unused_args):
           size = tensor_values[input_name]
           break
       key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners)
+      print(key)
       hist_inc(stats, key)
     elif op.type in ['AvgPool', 'MaxPool']:
       padding = op.get_attr('padding')
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 536a7fb8805bc136e3b235151bbfb433b6c96836..a8b13828de5208047292218a27d76e3f328923b7 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -6,6 +6,7 @@
 #define MACE_UTILS_UTILS_H_
 
 #include <sys/time.h>
+#include <sstream>
 
 namespace mace {
 template <typename Integer>
@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() {
   return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
 }
 
+template <typename T>
+inline std::string ToString(T v) {
+  std::ostringstream ss;
+  ss << v;
+  return ss.str();
+}
+
 } // namespace mace
 #endif // MACE_UTILS_UTILS_H_
diff --git a/tools/bazel-adb-run.sh b/tools/bazel-adb-run.sh
index fbd4fa007803aa2e6939485ca6b1601ad6b56dc1..b41d4d140303d8b682c49d40d23a35abe81b68c3 100755
--- a/tools/bazel-adb-run.sh
+++ b/tools/bazel-adb-run.sh
@@ -22,7 +22,10 @@ ANDROID_ABI=arm64-v8a
 STRIP=""
 STRIP="--strip always"
 
-bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
+# for profiling
+bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI --define profiling=true
+#bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
+
 if [ $? -ne 0 ]; then
   exit 1
 fi
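The --define profiling=true flag only takes effect because a config_setting keyed on that define gates extra copts through the if_profiling macro loaded in mace/core/BUILD. A sketch of what such a Starlark macro conventionally looks like; the actual definition lives in mace/mace.bzl, and the config_setting label used here is an assumption:

    # mace/mace.bzl (sketch): select() picks the branch whose config_setting
    # matches the build; --define profiling=true activates //mace:is_profiling.
    def if_profiling(a):
      return select({
          "//mace:is_profiling": a,
          "//conditions:default": [],
      })

Note also that the script's first STRIP="" assignment is dead (it is immediately overwritten by STRIP="--strip always"), and the non-profiling bazel invocation is kept only as a comment rather than behind a flag.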