diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index f24333ac461e9cbff8708f4202ad0f7545e56eeb..79093fa177b2d7a7ee188e47c3e2a368c080c8a4 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -236,7 +236,7 @@ void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
 
 OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
                              GPUPriorityHint gpu_priority_hint):
-    storage_(nullptr) {
+    storage_(nullptr), is_profiling_enabled_(false) {
   LoadOpenCLLibrary();
 
   std::vector<cl::Platform> all_platforms;
@@ -286,6 +286,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   if (Tuner<uint32_t>::Get()->IsTuning() ||
      (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
     properties |= CL_QUEUE_PROFILING_ENABLE;
+    is_profiling_enabled_ = true;
   }
 
   cl_int err;
@@ -590,4 +591,8 @@ const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
   return out_of_range_check_;
 }
 
+const bool OpenCLRuntime::is_profiling_enabled() const {
+  return is_profiling_enabled_;
+}
+
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index c06564908e56ba794963a8a70314760f31b5afc0..35a20bff198fc52f0859f935663118bbde3b45ec 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -77,6 +77,7 @@ class OpenCLRuntime {
   const GPUType ParseGPUType(const std::string &device_name);
   const std::string ParseDeviceVersion(const std::string &device_version);
   void SaveBuiltCLProgram();
+  const bool is_profiling_enabled() const;
 
  private:
   OpenCLRuntime(GPUPerfHint, GPUPriorityHint);
@@ -116,6 +117,7 @@ class OpenCLRuntime {
   std::string platform_info_;
   bool program_map_changed_;
   std::unique_ptr<KVStorage> storage_;
+  bool is_profiling_enabled_;
 
   static GPUPerfHint kGPUPerfHint;
   static GPUPriorityHint kGPUPriorityHint;
diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc
index b8cd709730d16aec1715e415bbeff4c552cd8787..c9b20577aecd2bf57b04be150b49ffc7d4a9cc05 100644
--- a/mace/core/runtime/opencl/opencl_wrapper.cc
+++ b/mace/core/runtime/opencl/opencl_wrapper.cc
@@ -168,6 +168,11 @@ class OpenCLLibraryImpl final {
                                                   size_t,
                                                   void *,
                                                   size_t *);
+  using clGetEventInfoFunc = cl_int (*)(cl_event event,
+                                        cl_event_info param_name,
+                                        size_t param_value_size,
+                                        void *param_value,
+                                        size_t *param_value_size_ret);
   using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
                                                  cl_profiling_info param_name,
                                                  size_t param_value_size,
@@ -221,6 +226,7 @@ class OpenCLLibraryImpl final {
   MACE_CL_DEFINE_FUNC_PTR(clReleaseDevice);
   MACE_CL_DEFINE_FUNC_PTR(clRetainEvent);
   MACE_CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
+  MACE_CL_DEFINE_FUNC_PTR(clGetEventInfo);
   MACE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo);
   MACE_CL_DEFINE_FUNC_PTR(clGetImageInfo);
 
@@ -344,6 +350,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
   MACE_CL_ASSIGN_FROM_DLSYM(clReleaseDevice);
   MACE_CL_ASSIGN_FROM_DLSYM(clRetainEvent);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
+  MACE_CL_ASSIGN_FROM_DLSYM(clGetEventInfo);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo);
 
@@ -881,6 +888,21 @@ CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 {
   return func(event);
 }
 
+// Event API
+CL_API_ENTRY cl_int clGetEventInfo(cl_event event,
+                                   cl_event_info param_name,
+                                   size_t param_value_size,
+                                   void *param_value,
+                                   size_t *param_value_size_ret)
+    CL_API_SUFFIX__VERSION_1_0 {
+  MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
+  auto func = mace::openclLibraryImpl->clGetEventInfo;
+  MACE_CHECK_NOTNULL(func);
+  MACE_LATENCY_LOGGER(3, "clGetEventInfo");
+  return func(event, param_name, param_value_size, param_value,
+              param_value_size_ret);
+}
+
 // Profiling APIs
 CL_API_ENTRY cl_int clGetEventProfilingInfo(cl_event event,
                                             cl_profiling_info param_name,
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index a2898edb26851fb8e98dea3b6d589c0f544b6c43..697ae85448dce15cbb036ac13b177f2359274022 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -137,6 +137,9 @@ static void ConcatN(cl::Kernel *kernel,
   const int inputs_count = input_list.size();
   index_t chan_blk_offset = 0;
+  cl::Event event;
+  CallStats call_stats{INT64_MAX, 0};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
   for (int i = 0; i < inputs_count; ++i) {
     const Tensor *input = input_list[i];
     index_t input_channel_blk = input->dim(3) / 4;
@@ -160,18 +163,45 @@ static void ConcatN(cl::Kernel *kernel,
     kernel->setArg(idx++, *(output->opencl_image()));
 
     chan_blk_offset += input_channel_blk;
-    const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
-    std::stringstream ss;
-    ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
-       << batch * height;
-    TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
+    cl_int error;
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    } else {
+      std::vector<uint32_t> roundup_gws(lws.size());
+      for (size_t j = 0; j < 3; ++j) {
+        roundup_gws[j] = RoundUp(gws[j], lws[j]);
+      }
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          *kernel, cl::NullRange,
+          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    }
+    MACE_CHECK_CL_SUCCESS(error);
 
     if (runtime->IsOutOfRangeCheckEnabled()) {
       (*kernel_error)->Map(nullptr);
       char *kerror_code = (*kernel_error)->mutable_data<char>();
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       (*kernel_error)->UnMap();
     }
+    if (runtime->is_profiling_enabled()) {
+      CallStats tmp_stats;
+      runtime->GetCallStats(event, &tmp_stats);
+      call_stats.start_micros = std::min(tmp_stats.start_micros,
+                                         call_stats.start_micros);
+      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+    }
+  }
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        stats->start_micros = call_stats.start_micros;
+        stats->end_micros = stats->start_micros + call_stats.end_micros;
+      }
+    };
   }
 }
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index 1945f14d135f8bebdf34d3e6d3b3ace5c75649eb..07f9086b74d2a819a79d7b906fe57576aff82d09 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -63,13 +63,8 @@ void SliceFunctor::operator()(
   };
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
-  std::stringstream ss;
-  ss << "slice_opencl_kernel_"
-     << input->dim(0) << "_"
-     << input->dim(1) << "_"
-     << input->dim(2) << "_"
-     << input_channels << "_"
-     << outputs_count;
+  cl::Event event;
+  CallStats call_stats{INT64_MAX, 0};
   for (int i = 0; i < outputs_count; ++i) {
     uint32_t idx = 0;
     if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -85,13 +80,45 @@
     kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
     kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
 
-    TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+    cl_int error;
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    } else {
+      std::vector<uint32_t> roundup_gws(lws.size());
+      for (size_t j = 0; j < 3; ++j) {
+        roundup_gws[j] = RoundUp(gws[j], lws[j]);
+      }
+
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange,
+          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    }
+    MACE_CHECK_CL_SUCCESS(error);
 
     if (runtime->IsOutOfRangeCheckEnabled()) {
       kernel_error_->Map(nullptr);
       char *kerror_code = kernel_error_->mutable_data<char>();
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       kernel_error_->UnMap();
     }
+    if (runtime->is_profiling_enabled()) {
+      CallStats tmp_stats;
+      runtime->GetCallStats(event, &tmp_stats);
+      call_stats.start_micros = std::min(tmp_stats.start_micros,
+                                         call_stats.start_micros);
+      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+    }
+  }
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        stats->start_micros = call_stats.start_micros;
+        stats->end_micros = stats->start_micros + call_stats.end_micros;
+      }
+    };
   }
 }
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
index efc8bd7fc1bb4e59e92c0f14c11736011432586a..d45939577ec8b837cc4424159bc95891d3ee54f4 100644
--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -151,12 +151,17 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
   int num_inputs = shapes.size();
   int concat_axis_size = 0;
   // Construct graph
+  std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>());
+  std::vector<const float *> input_ptrs(num_inputs);
   OpsTestNet net;
   for (int i = 0; i < num_inputs; ++i) {
     const std::string input_name = MakeString("Input", i);
     const std::string image_name = MakeString("InputImage", i);
     concat_axis_size += shapes[i][axis];
-    net.AddRandomInput(input_name, shapes[i]);
+    GenerateRandomRealTypeData(shapes[i], &inputs[i]);
+    input_ptrs[i] = inputs[i].data();
+    net.AddInputFromArray(input_name,
+                          shapes[i], inputs[i]);
     BufferToImage(&net, input_name, image_name,
                   kernels::BufferType::IN_OUT_CHANNEL);
   }
@@ -186,17 +191,15 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
   Tensor::MappingGuard output_mapper(output);
   const float *output_ptr = output->data<float>();
+  const float *output_ptr_end = output_ptr + output->size();
   int k = 0;
-  while (output_ptr != (output->data<float>() + output->size())) {
+  while (output_ptr != output_ptr_end) {
     for (int i = 0; i < num_inputs; ++i) {
       index_t num_elements =
           std::accumulate(shapes[i].begin() + axis, shapes[i].end(), 1,
                           std::multiplies<index_t>());
-      const std::string input_name = MakeString("Input", i);
-      const Tensor *input_tensor = net.GetTensor(input_name.data());
-      Tensor::MappingGuard input_guard(input_tensor);
-      const float *input_ptr = input_tensor->data<float>() + k * num_elements;
+      const float *input_ptr = input_ptrs[i] + k * num_elements;
       for (int j = 0; j < num_elements; ++j) {
         EXPECT_NEAR(*(input_ptr + j), *output_ptr++, 1e-2)
             << "With index: " << i << ", " << j;
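
Note: the concat and slice changes above follow the same pattern: each output slice is enqueued as its own kernel, and the per-kernel profiling times are folded into a single CallStats (earliest start, summed durations) that the future's wait_fn later reports. The stand-alone sketch below models only that accumulation step; the hard-coded per_kernel timings stand in for what runtime->GetCallStats() would read from each cl::Event when CL_QUEUE_PROFILING_ENABLE is set, and are illustrative values, not part of the patch.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors mace::CallStats: start and end timestamps in microseconds.
struct CallStats {
  int64_t start_micros;
  int64_t end_micros;
};

int main() {
  // Stand-ins for the per-kernel timings of three enqueued kernels
  // (made-up numbers for illustration).
  const std::vector<CallStats> per_kernel = {
      {1000, 1400}, {1450, 1700}, {1720, 2100}};

  // Same accumulation as in ConcatN / SliceFunctor: keep the earliest start,
  // and add up each kernel's duration.
  CallStats call_stats{INT64_MAX, 0};
  for (const CallStats &tmp_stats : per_kernel) {
    call_stats.start_micros =
        std::min(tmp_stats.start_micros, call_stats.start_micros);
    call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
  }

  // What the future's wait_fn reports: start is the earliest start, end is
  // that start plus the summed kernel time.
  const int64_t reported_start = call_stats.start_micros;
  const int64_t reported_end = reported_start + call_stats.end_micros;
  std::cout << "start=" << reported_start << "us end=" << reported_end
            << "us total=" << (reported_end - reported_start) << "us\n";
  return 0;
}

Because durations are summed rather than taking the last end minus the first start, host-side gaps between the enqueued kernels do not inflate the reported GPU time.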