diff --git a/mace/kernels/opencl/cl/relu.cl b/mace/kernels/opencl/cl/relu.cl
index 33fe65d00b2496315b86cde094cd4459f17c3a51..17d391658e367e7b862ff713c177766a70e203f3 100644
--- a/mace/kernels/opencl/cl/relu.cl
+++ b/mace/kernels/opencl/cl/relu.cl
@@ -1,35 +1,29 @@
 #include <common.h>
 
 // Supported data type: half/float
-__kernel void relu(__global const DATA_TYPE *input,
-                   __private const int size,
-                   __global DATA_TYPE *output) {
-  int idx = get_global_id(0);
+__kernel void relu(__read_only image2d_t input,
+                   __write_only image2d_t output) {
+  const int ch_blk = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);
 
-  if (idx + 4 > size) {
-    for(; idx < size; ++idx) {
-      *(output+idx) = fmax(*(input+idx), 0);
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0);
-    vstore4(data, idx, output);
-  }
+  const int pos = ch_blk * width + w;
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
+  DATA_TYPE4 out = fmax(in, (DATA_TYPE4)0);
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
 
-__kernel void relux(__global const DATA_TYPE *input,
+__kernel void relux(__read_only image2d_t input,
                     __private const DATA_TYPE max_limit,
-                    __private const int size,
-                    __global DATA_TYPE *output) {
-  int idx = get_global_id(0);
+                    __write_only image2d_t output) {
+  const int ch_blk = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);
 
-  if (idx + 4 > size) {
-    for(; idx < size; ++idx) {
-      *(output+idx) = clamp(*(input+idx), 0.0f, max_limit);
-    }
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit);
-    vstore4(data, idx, output);
-  }
+  const int pos = ch_blk * width + w;
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
+  DATA_TYPE4 out = clamp(in, (DATA_TYPE4)0, (DATA_TYPE4)max_limit);
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc
index 1149b965a2fc91c5394c97b7028d872b827dc125..46988793525883d1305c33a6cf548f70f58c2025 100644
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -6,58 +6,70 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-template <>
-void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
+template <typename T>
+void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                         Tensor *output) {
 
-  index_t element_size = input->NumElements();
-  index_t blocks = (element_size + 3) / 4;
+  const index_t batch = input->dim(0);
+  const index_t height = input->dim(1);
+  const index_t width = input->dim(2);
+  const index_t channels = input->dim(3);
 
-  const uint32_t gws = blocks;
+  const index_t channel_blocks = RoundUpDiv4(channels);
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
 
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   if (max_limit_ < 0) {
     auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
-    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t lws[3] = {1, kwg_size, 1};
 
     uint32_t idx = 0;
     relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-    relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
     relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
 
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
-        cl::NDRange(gws),
-        cl::NDRange(lws),
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]),
         NULL, OpenCLRuntime::Get()->GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   } else {
     auto relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
-
-    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t lws[3] = {1, kwg_size, 1};
 
     uint32_t idx = 0;
     relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
     relu_kernel.setArg(idx++, max_limit_);
-    relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
     relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
 
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
-        cl::NDRange(gws),
-        cl::NDRange(lws),
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]),
         NULL, OpenCLRuntime::Get()->GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   }
 }
 
+template
+struct ReluFunctor<DeviceType::OPENCL, float>;
+template
+struct ReluFunctor<DeviceType::OPENCL, half>;
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h
index 347d586bee77a4c3948d032919ce1bdf1970e945..ea94c79a96ad523653ec0defd53902af3ac698f6 100644
--- a/mace/kernels/relu.h
+++ b/mace/kernels/relu.h
@@ -33,11 +33,15 @@ struct ReluFunctor {
 template <>
 void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                                                       Tensor *output);
-template <>
-void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
-                                                        Tensor *output);
+
+template <typename T>
+struct ReluFunctor<DeviceType::OPENCL, T> {
+  T max_limit_;
+
+  void operator()(const Tensor *input, Tensor *output);
+};
 
 }  //  namespace kernels
 }  //  namespace mace
 
-#endif  // MACE_KERNELS_RELU_H_
\ No newline at end of file
+#endif  // MACE_KERNELS_RELU_H_
diff --git a/mace/ops/relu.cc b/mace/ops/relu.cc
index c86fb38f1f9a56f8d0721593e48e1bfb4b67db05..3365439398af6d1aded3d1f28304958da097b7ab 100644
--- a/mace/ops/relu.cc
+++ b/mace/ops/relu.cc
@@ -12,5 +12,14 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>);
 #endif  // __ARM_NEON
 
-REGISTER_OPENCL_OPERATOR(Relu, ReluOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         ReluOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         ReluOp<DeviceType::OPENCL, half>);
+
 }  //  namespace mace
diff --git a/mace/ops/relu.h b/mace/ops/relu.h
index fea49c8da843ce1f6cc3010209b2d8697963342a..613776a870b736b9a2166339301674bf484fe4fe 100644
--- a/mace/ops/relu.h
+++ b/mace/ops/relu.h
@@ -16,7 +16,7 @@ class ReluOp : public Operator<D, T> {
   ReluOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws) {
     functor_.max_limit_ =
-        OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
+        OperatorBase::GetSingleArgument<float>("max_limit", static_cast<float>(-1));
   }
   bool Run() override {
     const Tensor *input_tensor = this->inputs_[0];
diff --git a/mace/ops/relu_benchmark.cc b/mace/ops/relu_benchmark.cc
index 14badcd9f98083b3b50f93d672cfc4a53d692b27..c68009c9cb1c6bd29806078c69ab47ff005689fa 100644
--- a/mace/ops/relu_benchmark.cc
+++ b/mace/ops/relu_benchmark.cc
@@ -9,17 +9,28 @@
 
 namespace mace {
 template <DeviceType D, typename T>
-static void ReluBenchmark(int iters, int size) {
+static void ReluBenchmark(
+    int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  OpDefBuilder("Relu", "ReluBM")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
-  net.AddRandomInput<D, float>("Input", {size});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+
+    OpDefBuilder("Relu", "ReluBM")
+        .Input("InputImage")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("Relu", "ReluBM")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
@@ -34,21 +45,23 @@ static void ReluBenchmark(int iters, int size) {
   net.Sync();
 }
 
-#define BM_RELU_MACRO(SIZE, TYPE, DEVICE)                     \
-  static void BM_RELU_##SIZE##_##TYPE##_##DEVICE(int iters) { \
-    const int64_t tot = static_cast<int64_t>(iters) * SIZE;   \
-    mace::testing::ItemsProcessed(tot);                       \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));       \
-    ReluBenchmark<DEVICE, TYPE>(iters, SIZE);                 \
-  }                                                           \
-  BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE)
-
-#define BM_RELU(SIZE, TYPE)       \
-  BM_RELU_MACRO(SIZE, TYPE, CPU); \
-  BM_RELU_MACRO(SIZE, TYPE, NEON);\
-  BM_RELU_MACRO(SIZE, TYPE, OPENCL);
-
-BM_RELU(1000, float);
-BM_RELU(100000, float);
-BM_RELU(10000000, float);
-}  //  namespace mace
\ No newline at end of file
+#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                      \
+  static void BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE(int iters) {  \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
+    mace::testing::ItemsProcessed(tot);                              \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
+    ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                  \
+  }                                                                  \
+  BENCHMARK(BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE)
+
+#define BM_RELU(N, C, H, W, TYPE)       \
+  BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \
+  BM_RELU_MACRO(N, C, H, W, TYPE, NEON);\
+  BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
+
+BM_RELU(1, 1, 512, 512, float);
+BM_RELU(1, 3, 128, 128, float);
+BM_RELU(1, 3, 512, 512, float);
+BM_RELU(1, 32, 112, 112, float);
+BM_RELU(1, 64, 256, 256, float);
+}  //  namespace mace
diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc
index 56aace07079264d6870411406449ea735dac25ce..879ecb3211f75ebeb791467ae152d40e653ccdc9 100644
--- a/mace/ops/relu_test.cc
+++ b/mace/ops/relu_test.cc
@@ -12,10 +12,6 @@ class ReluOpTest : public OpsTestBase {};
 template <DeviceType D>
 void TestSimple() {
   OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
@@ -23,8 +19,28 @@ void TestSimple() {
                                   {-7, 7, -6, 6, -5, 5, -4, 4,
                                    -3, 3, -2, 2, -1, 1, 0, 0});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+  }
 
   auto expected = CreateTensor<float>({2, 2, 2, 2},
                                       {0, 7, 0, 6, 0, 5, 0, 4,
@@ -48,20 +64,36 @@ TEST_F(ReluOpTest, OPENCLSimple) {
 template <DeviceType D>
 void TestUnalignedSimple() {
   OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
-                                  {1, 1, 3, 2},
+                                  {1, 3, 2, 1},
                                   {-7, 7, -6, 6, -5, 5});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+  }
 
-  auto expected = CreateTensor<float>({1, 1, 3, 2},
+  auto expected = CreateTensor<float>({1, 3, 2, 1},
                                       {0, 7, 0, 6, 0, 5});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
@@ -82,11 +114,6 @@ TEST_F(ReluOpTest, OPENCLUnalignedSimple) {
 template <DeviceType D>
 void TestSimpleReluX() {
   OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .AddFloatArg("max_limit", 6)
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
@@ -94,8 +121,30 @@ void TestSimpleReluX() {
                                   {-7, 7, -6, 6, -5, 5, -4, 4,
                                    -3, 3, -2, 2, -1, 1, 0, 0});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+  }
 
   auto expected = CreateTensor<float>({2, 2, 2, 2},
                                       {0, 6, 0, 6, 0, 5, 0, 4,
@@ -119,21 +168,38 @@ TEST_F(ReluOpTest, OPENCLSimpleReluX) {
 template <DeviceType D>
 void TestUnalignedSimpleReluX() {
   OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .AddFloatArg("max_limit", 6)
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>("Input",
-                                  {1, 1, 1, 7},
+                                  {1, 1, 7, 1},
                                   {-7, 7, -6, 6, -5, 5, -4});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+  }
 
-  auto expected = CreateTensor<float>({1, 1, 1, 7},
+  auto expected = CreateTensor<float>({1, 1, 7, 1},
                                       {0, 6, 0, 6, 0, 5, 0});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);