diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index a2b4c5cbe34a550d40224afb9e6c2d77aeacb980..bb2cea3b55dba1d32debfb6d30333f93259b2614 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -8,11 +8,9 @@
 #include <mutex>
 
 #include <dirent.h>
-#include <errno.h>
 
 #include "mace/core/logging.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/core/runtime/opencl/opencl_wrapper.h"
 
 namespace mace {
 namespace {
@@ -66,7 +64,7 @@ bool BuildProgram(OpenCLRuntime *runtime,
   };
 
   *program = cl::Program(runtime->context(), sources);
-  std::string build_options = "-Werror -cl-mad-enable -I" + path;
+  std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
   // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
   if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) {
     if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 057b2a80320130d322a1146698cfcb40334ca0a4..e18c9f93dae38ff8307adecbf321e65252d47427 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -20,15 +20,18 @@ namespace mace {
 class OpenCLRuntime {
  public:
   static OpenCLRuntime *Get();
-  OpenCLRuntime(cl::Context context,
-                cl::Device device,
-                cl::CommandQueue command_queue);
-  ~OpenCLRuntime();
 
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
   cl::Program &program();
+ private:
+  OpenCLRuntime(cl::Context context,
+                cl::Device device,
+                cl::CommandQueue command_queue);
+  ~OpenCLRuntime();
+  OpenCLRuntime(const OpenCLRuntime&) = delete;
+  OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
 
  private:
   cl::Context context_;
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 5c838be47b1ac118266a64197fd2a462eadaa433..cd3fb4b9bc47b31d82b7f0d053f356f3c1c62f6a 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -13,16 +13,13 @@ namespace kernels {
 
 template <DeviceType D, typename T>
 struct BatchNormFunctor {
-  void operator()(const T *input,
-                  const T *scale,
-                  const T *offset,
-                  const T *mean,
-                  const T *var,
-                  const float variance_epsilon,
-                  const index_t n,
-                  const index_t channel,
-                  const index_t sample_size,
-                  T *output) {
+  void operator()(const Tensor *input,
+                  const Tensor *scale,
+                  const Tensor *offset,
+                  const Tensor *mean,
+                  const Tensor *var,
+                  const Tensor *epsilon,
+                  Tensor *output) {
     // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
     // The calculation formula for inference is
     // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
@@ -31,16 +28,35 @@ struct BatchNormFunctor {
     // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
-    T new_scale, new_offset;
+    const index_t n = input->dim(0);
+    const index_t channel = input->dim(1);
+    const index_t sample_size = input->dim(2) * input->dim(3);
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard scale_mapper(scale);
+    Tensor::MappingGuard offset_mapper(offset);
+    Tensor::MappingGuard mean_mapper(mean);
+    Tensor::MappingGuard var_mapper(var);
+    Tensor::MappingGuard epsilon_mapper(epsilon);
+    Tensor::MappingGuard output_mapper(output);
+
+    const T *input_ptr = input->data<T>();
+    const T *scale_ptr = scale->data<T>();
+    const T *offset_ptr = offset->data<T>();
+    const T *mean_ptr = mean->data<T>();
+    const T *var_ptr = var->data<T>();
+    const T *epsilon_ptr = epsilon->data<T>();
+    T *output_ptr = output->mutable_data<T>();
+
 #pragma omp parallel for
     for (index_t c = 0; c < channel; ++c) {
-      new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon);
-      new_offset = offset[c] - mean[c] * new_scale;
+      T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
+      T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale;
       index_t pos = c * sample_size;
 
       for (index_t i = 0; i < n; ++i) {
-        const T *input_sample_ptr = input + pos;
-        T *output_sample_ptr = output + pos;
+        const T *input_sample_ptr = input_ptr + pos;
+        T *output_sample_ptr = output_ptr + pos;
         for (index_t j = 0; j < sample_size; ++j) {
           output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
         }
@@ -52,16 +68,23 @@ struct BatchNormFunctor {
 
 template <>
 void BatchNormFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const float *scale,
-    const float *offset,
-    const float *mean,
-    const float *var,
-    const float variance_epsilon,
-    const index_t n,
-    const index_t channel,
-    const index_t sample_size,
-    float *output);
+    const Tensor *input,
+    const Tensor *scale,
+    const Tensor *offset,
+    const Tensor *mean,
+    const Tensor *var,
+    const Tensor *epsilon,
+    Tensor *output);
+
+template <>
+void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
+    const Tensor *input,
+    const Tensor *scale,
+    const Tensor *offset,
+    const Tensor *mean,
+    const Tensor *var,
+    const Tensor *epsilon,
+    Tensor *output);
 
 }  //  namepsace kernels
 }  //  namespace mace
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
index cd5fff22a8e608c9c59f986e1f9ee980fb944e4d..295cc59d48d44fe385ae7d86564674d5d8eecc78 100644
--- a/mace/kernels/neon/batch_norm_neon.cc
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -10,38 +10,46 @@ namespace kernels {
 
 template <>
 void BatchNormFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const float *scale,
-    const float *offset,
-    const float *mean,
-    const float *var,
-    const float variance_epsilon,
-    const index_t n,
-    const index_t channel,
-    const index_t sample_size,
-    float *output) {
+    const Tensor *input,
+    const Tensor *scale,
+    const Tensor *offset,
+    const Tensor *mean,
+    const Tensor *var,
+    const Tensor *epsilon,
+    Tensor *output) {
   // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
   // The calculation formula for inference is
-  // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
-  //          ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon}
+  // Y = \frac{ \scale } { \sqrt{var+\epsilon} } * X +
+  //          ( \offset - \frac { \scale * mean } { \sqrt{var+\epsilon}
   //          }
-  // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} }
+  // new_scale = \frac{ \scale } { \sqrt{var+\epsilon} }
   // new_offset = \offset - mean * common_val;
   // Y = new_scale * X + new_offset;
-  float new_scale, new_offset;
+  const index_t n = input->dim(0);
+  const index_t channel = input->dim(1);
+  const index_t sample_size = input->dim(2) * input->dim(3);
+
+  const float *input_ptr = input->data<float>();
+  const float *scale_ptr = scale->data<float>();
+  const float *offset_ptr = offset->data<float>();
+  const float *mean_ptr = mean->data<float>();
+  const float *var_ptr = var->data<float>();
+  const float *epsilon_ptr = epsilon->data<float>();
+  float *output_ptr = output->mutable_data<float>();
+
   index_t count = sample_size >> 2;
   index_t remain_count = sample_size - (count << 2);
 #pragma omp parallel for
   for (index_t c = 0; c < channel; ++c) {
-    new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon);
-    new_offset = offset[c] - mean[c] * new_scale;
+    float new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr);
+    float new_offset = offset_ptr[c] - mean_ptr[c] * new_scale;
     index_t pos = c * sample_size;
 
     float32x4_t new_scale_f = vdupq_n_f32(new_scale);
     float32x4_t new_offset_f = vdupq_n_f32(new_offset);
     for (index_t i = 0; i < n; ++i) {
-      const float *input_sample_ptr = input + pos;
-      float *output_sample_ptr = output + pos;
+      const float *input_sample_ptr = input_ptr + pos;
+      float *output_sample_ptr = output_ptr + pos;
 
       for (index_t j = 0; j < count; ++j) {
         float32x4_t input_f = vld1q_f32(input_sample_ptr);
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67b810f5149ea51e2456071415857bca467abbf0
--- /dev/null
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -0,0 +1,46 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/batch_norm.h"
+#include "mace/core/runtime/opencl/cl2.hpp"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+
+namespace mace {
+namespace kernels {
+
+template <>
+void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
+    const Tensor *input,
+    const Tensor *scale,
+    const Tensor *offset,
+    const Tensor *mean,
+    const Tensor *var,
+    const Tensor *epsilon,
+    Tensor *output) {
+  const index_t n = input->dim(0);
+  const index_t channel = input->dim(1);
+  const index_t sample_size = input->dim(2) * input->dim(3);
+
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+  auto _kernel = cl::Kernel(program, "batch_norm");
+  _kernel.setArg(0, *(static_cast<const cl::Buffer *>(input->buffer())));
+  _kernel.setArg(1, *(static_cast<cl::Buffer *>(scale->buffer())));
+  _kernel.setArg(2, *(static_cast<cl::Buffer *>(offset->buffer())));
+  _kernel.setArg(3, *(static_cast<cl::Buffer *>(mean->buffer())));
+  _kernel.setArg(4, *(static_cast<cl::Buffer *>(var->buffer())));
+  _kernel.setArg(5, *(static_cast<cl::Buffer *>(epsilon->buffer())));
+  _kernel.setArg(6, static_cast<int>(sample_size));
+  _kernel.setArg(7, *(static_cast<cl::Buffer *>(output->buffer())));
+  _kernel.setArg(8, 32u, nullptr);
+  _kernel.setArg(9, 32u, nullptr);
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      _kernel, cl::NullRange,
+      cl::NDRange(n, channel, sample_size),
+      cl::NDRange(1, 1, 128));
+  MACE_CHECK(error == CL_SUCCESS);
+}
+
+}  // namespace kernels
+}  //  namespace mace
\ No newline at end of file
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
new file mode 100644
index 0000000000000000000000000000000000000000..d5927071f222b6ff0c0cdb7dd32e2b15979983f8
--- /dev/null
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -0,0 +1,31 @@
+void kernel batch_norm(global const float *input,
+                       global const float *scale,
+                       global const float *offset,
+                       global const float *mean,
+                       global const float *var,
+                       global const float *epsilon,
+                       private const int pixels,
+                       global float *output,
+                       __local float *new_scale,
+                       __local float *new_offset) {
+  const int batch = get_global_id(0);
+  const int channel = get_global_id(1);
+  const int channels = get_global_size(1);
+  const int pixel_offset = get_global_id(2);
+  const unsigned int local_channel = get_local_id(1);
+  const int local_pixel_idx = get_local_id(2);
+
+  if(local_pixel_idx == 0) {
+    new_scale[local_channel] = scale[channel] * rsqrt(var[channel] + *epsilon);
+    new_offset[local_channel] = offset[channel] - mean[channel] * new_scale[local_channel];
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  const int sample_offset = (batch * channels + channel) * pixels + pixel_offset;
+
+  const float *input_ptr = input + sample_offset;
+  float *output_ptr = output + sample_offset;
+  *output_ptr = new_scale[local_channel] * *input_ptr + new_offset[local_channel];
+}
+
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 83574b53373fd52226aef06ebd80f392131c6732..e823136d965670cc198ed14c0f682cbf0b152d00 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -17,6 +17,7 @@ cc_library(
     ],
     deps = [
         "//mace/core",
+        "//mace/core:opencl_runtime",
         "@gtest//:gtest",
     ],
 )
@@ -39,7 +40,6 @@ cc_library(
         "-fopenmp",
     ],
     deps = [
-        "//mace/core",
         "//mace/kernels",
         "//mace/proto:cc_proto",
     ],
@@ -72,7 +72,6 @@ cc_test(
     deps = [
         ":ops",
         ":test",
-        "//mace/core",
         "//mace/core:test_benchmark_main",
     ],
 )
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index f5b050f1ef1f4910eb9b3366735968daf99b7630..1ce9b1e090bbf171bbe3ff33c07512af12e94c80 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp<DeviceType::NEON, float>);
 #endif  // __ARM_NEON
 
+REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp<DeviceType::OPENCL, float>);
+
 }  //  namespace mace
\ No newline at end of file
diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h
index a7292601315c1db3d4d97659a154ad0bf91d474c..1452bc726c4807b079bb6dcb24b4319275059bc1 100644
--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
@@ -40,20 +40,7 @@ class BatchNormOp : public Operator<D, T> {
     Tensor *output = this->Output(0);
     output->ResizeLike(input);
 
-    const index_t n = input->dim(0);
-    const index_t channel = input->dim(1);
-    const index_t sample_size = input->dim(2) * input->dim(3);
-
-    const T *input_ptr = input->data<T>();
-    const T *scale_ptr = scale->data<T>();
-    const T *offset_ptr = offset->data<T>();
-    const T *mean_ptr = mean->data<T>();
-    const T *var_ptr = var->data<T>();
-    const T *epsilon_ptr = epsilon->data<T>();
-    T *output_ptr = output->mutable_data<T>();
-
-    functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, *epsilon_ptr,
-             n, channel, sample_size, output_ptr);
+    functor_(input, scale, offset, mean, var, epsilon, output);
     return true;
   }
 
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index 8fc2479794961001c18aac4b69f3ac767c78425c..3d17aca7ec153060629a551e4eba3829c22d4529 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -24,21 +24,23 @@ static void BatchNorm(
       .Finalize(net.operator_def());
 
   // Add input data
-  net.AddRandomInput<DeviceType::CPU, T>("Input", {batch, channels, height, width});
-  net.AddRandomInput<DeviceType::CPU, T>("Scale", {channels});
-  net.AddRandomInput<DeviceType::CPU, T>("Offset", {channels});
-  net.AddRandomInput<DeviceType::CPU, T>("Mean", {channels});
-  net.AddRandomInput<DeviceType::CPU, T>("Var", {channels}, true);
-  net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
+  net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
+  net.AddRandomInput<D, T>("Scale", {channels});
+  net.AddRandomInput<D, T>("Offset", {channels});
+  net.AddRandomInput<D, T>("Mean", {channels});
+  net.AddRandomInput<D, T>("Var", {channels}, true);
+  net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
+    net.Sync();
   }
 
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
+    net.Sync();
   }
 }
 
@@ -54,7 +56,8 @@ static void BatchNorm(
 
 #define BM_BATCH_NORM(N, C, H, W, TYPE)       \
   BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON);
+  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON); \
+  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, OPENCL);
 
 BM_BATCH_NORM(1, 1, 512, 512, float);
 BM_BATCH_NORM(1, 3, 128, 128, float);
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
index 9933877856754643334ebd0522bc505843a2814e..4c5d73bbe3981ec2af972a5c370ec2794c22ac2d 100644
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -9,9 +9,10 @@ namespace mace {
 
 class BatchNormOpTest : public OpsTestBase {};
 
-TEST_F(BatchNormOpTest, SimpleCPU) {
+template <DeviceType D>
+void Simple() {
   // Construct graph
-  auto &net = test_net();
+  OpsTestNet net;
   OpDefBuilder("BatchNorm", "BatchNormTest")
       .Input("Input")
       .Input("Scale")
@@ -23,26 +24,79 @@ TEST_F(BatchNormOpTest, SimpleCPU) {
       .Finalize(net.operator_def());
 
   // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 1, 6, 2},
+  net.AddInputFromArray<D, float>("Input", {1, 1, 6, 2},
                                {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
-  net.AddInputFromArray<DeviceType::CPU, float>("Scale", {1}, {4.0f});
-  net.AddInputFromArray<DeviceType::CPU, float>("Offset", {1}, {2.0});
-  net.AddInputFromArray<DeviceType::CPU, float>("Mean", {1}, {10});
-  net.AddInputFromArray<DeviceType::CPU, float>("Var", {1}, {11.67f});
-  net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
+  net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
+  net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
+  net.AddInputFromArray<D, float>("Mean", {1}, {10});
+  net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
+  net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
 
   // Run
-  net.RunOp();
+  net.RunOp(D);
 
   // Check
   auto expected =
       CreateTensor<float>({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                          3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.01);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
+}
+
+TEST_F(BatchNormOpTest, SimpleCPU) {
+  Simple<DeviceType::CPU>();
+}
+
+TEST_F(BatchNormOpTest, SimpleNEON) {
+  Simple<DeviceType::NEON>();
+}
+
+TEST_F(BatchNormOpTest, SimpleOPENCL) {
+  Simple<DeviceType::OPENCL>();
 }
 
-TEST_F(BatchNormOpTest, SimpleNeon) {
+TEST_F(BatchNormOpTest, SimpleRandomNeon) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 64;
+  index_t width = 64;
+  // Construct graph
+  auto &net = test_net();
+  OpDefBuilder("BatchNorm", "BatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Input("Epsilon")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, channels, height, width});
+  net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
+  net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run NEON
+  net.RunOp(DeviceType::NEON);
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
+}
+
+TEST_F(BatchNormOpTest, ComplexRandomNeon) {
   srand(time(NULL));
 
   // generate random input
@@ -74,11 +128,96 @@ TEST_F(BatchNormOpTest, SimpleNeon) {
   net.RunOp();
 
   // Check
-  Tensor *expected = net.GetOutput("Output");
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
 
   // Run NEON
   net.RunOp(DeviceType::NEON);
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
 }
+
+TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 64;
+  index_t width = 64;
+  // Construct graph
+  auto &net = test_net();
+  OpDefBuilder("BatchNorm", "BatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Input("Epsilon")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
+  net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
+
+  // Run NEON
+  net.RunOp(DeviceType::OPENCL);
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // run cpu
+  net.RunOp();
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
+}
+
+TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
+  srand(time(NULL));
+
+  // generate random input
+  index_t batch = 1 + rand() % 10;
+  index_t channels = 3 + rand() % 50;
+  index_t height = 103;
+  index_t width = 113;
+  // Construct graph
+  auto &net = test_net();
+  OpDefBuilder("BatchNorm", "BatchNormTest")
+      .Input("Input")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .Input("Epsilon")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>("Input", {batch, channels, height, width});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
+  net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
+
+  // Run NEON
+  net.RunOp(DeviceType::OPENCL);
+  net.Sync();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // run cpu
+  net.RunOp();
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
+}
+
 }
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 3b2ddfe0f69dcac9f624ff67907ad640a4f5505a..678f855fc7b98b5c7ec66f13e07e75ab121e067e 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -11,6 +11,7 @@
 #include "mace/core/common.h"
 #include "mace/core/net.h"
 #include "mace/core/tensor.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 
 namespace mace {
 
@@ -152,6 +153,12 @@ class OpsTestNet {
     return ws_.GetTensor(output_name);
   }
 
+  void Sync() {
+    if (net_) {
+      OpenCLRuntime::Get()->command_queue().finish();
+    }
+  }
+
  public:
   Workspace ws_;
   OperatorDef op_def_;